Prepare data of Analysis, Treat Missing Data, Wrong Data, Outlier Data¶

Import Libraries¶

In [ ]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,recall_score,f1_score,precision_score

# import tensorflow as tf
# import keras

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

Read The Datasets¶

In [ ]:
df_attr = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
In [ ]:
pd.options.display.max_columns=200
In [ ]:
df_attr.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [ ]:
# pd.options.display.max_columns=200
In [ ]:
df_attr.shape
Out[ ]:
(1470, 35)

Data Visulization¶

In [ ]:
# Attrition

print(pd.DataFrame(df_attr['Attrition'].value_counts()))
print('----------------------------------------------')

print('Yes      :', round((237/(1233+237)*100),2),'%')
print('No       :', round((1233/(1233+237)*100),2),'%')

sns.countplot(x=df_attr['Attrition'])
   Attrition
0       1233
1        237
----------------------------------------------
Yes      : 16.12 %
No       : 83.88 %
Out[ ]:
<AxesSubplot: xlabel='Attrition', ylabel='count'>
In [ ]:
# Age
plt.figure(figsize=(20,6))
sns.countplot(x=df_attr['Age'],hue = df_attr['Attrition'])
Out[ ]:
<AxesSubplot: xlabel='Age', ylabel='count'>
In [ ]:
# Business Travel

print(pd.DataFrame(df_attr['BusinessTravel'].value_counts()))
print('----------------------------------------------------')

print('BusinessTravel     :    ', round(1043/(1043+277+150)*100,2),'%')
print('Travel_Frequently  :    ', round(277/(1043+277+150)*100,2),'%')
print('Non-Travel         :    ', round(150/(1043+277+150)*100,2),'%')

sns.countplot(x=df_attr['BusinessTravel'], hue = df_attr['Attrition'])
                   BusinessTravel
Travel_Rarely                1043
Travel_Frequently             277
Non-Travel                    150
----------------------------------------------------
BusinessTravel     :     70.95 %
Travel_Frequently  :     18.84 %
Non-Travel         :     10.2 %
Out[ ]:
<AxesSubplot: xlabel='BusinessTravel', ylabel='count'>
In [ ]:
# Department
plt.figure(figsize=(8,4))

print(pd.DataFrame(df_attr['Department'].value_counts()))
print('----------------------------------------------')

print(pd.DataFrame(df_attr['Department'].value_counts(normalize=True)))

sns.countplot(x=df_attr['Department'], hue = df_attr['Attrition'])
                        Department
Research & Development         961
Sales                          446
Human Resources                 63
----------------------------------------------
                        Department
Research & Development    0.653741
Sales                     0.303401
Human Resources           0.042857
Out[ ]:
<AxesSubplot: xlabel='Department', ylabel='count'>
In [ ]:
# EducationField
plt.figure(figsize=(10,4))

print(pd.DataFrame(df_attr['EducationField'].value_counts()))
print('--------------------------------------------------')

print(pd.DataFrame(df_attr['EducationField'].value_counts(normalize=True)))

sns.countplot(x=df_attr['EducationField'], hue = df_attr['Attrition'])
                  EducationField
Life Sciences                606
Medical                      464
Marketing                    159
Technical Degree             132
Other                         82
Human Resources               27
--------------------------------------------------
                  EducationField
Life Sciences           0.412245
Medical                 0.315646
Marketing               0.108163
Technical Degree        0.089796
Other                   0.055782
Human Resources         0.018367
Out[ ]:
<AxesSubplot: xlabel='EducationField', ylabel='count'>
In [ ]:
# Gender

print(pd.DataFrame(df_attr['Gender'].value_counts()))
print('------------------------------------------')
print(pd.DataFrame(df_attr['Gender'].value_counts(normalize=True)))
sns.countplot(x=df_attr['Gender'], hue = df_attr['Attrition'])
        Gender
Male       882
Female     588
------------------------------------------
        Gender
Male       0.6
Female     0.4
Out[ ]:
<AxesSubplot: xlabel='Gender', ylabel='count'>
In [ ]:
# MaritalStatus

print(pd.DataFrame(df_attr['MaritalStatus'].value_counts()))
print('------------------------------------------------')

print(pd.DataFrame(df_attr['MaritalStatus'].value_counts(normalize=True)))

sns.countplot(x=df_attr['MaritalStatus'],hue=df_attr['Attrition'])
          MaritalStatus
Married             673
Single              470
Divorced            327
------------------------------------------------
          MaritalStatus
Married        0.457823
Single         0.319728
Divorced       0.222449
Out[ ]:
<AxesSubplot: xlabel='MaritalStatus', ylabel='count'>
In [ ]:
# Overtime

print(pd.DataFrame(df_attr['OverTime'].value_counts()))
print('-----------------------------------------')

print(pd.DataFrame(df_attr['OverTime'].value_counts(normalize=True)))

sns.countplot(x = df_attr['OverTime'], hue = df_attr['Attrition'])
     OverTime
No       1054
Yes       416
-----------------------------------------
     OverTime
No   0.717007
Yes  0.282993
Out[ ]:
<AxesSubplot: xlabel='OverTime', ylabel='count'>
In [ ]:
df_attr.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Attrition                 1470 non-null   int64 
 1   Age                       1470 non-null   int64 
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(27), object(8)
memory usage: 402.1+ KB

Missing Data Treatment¶

In [ ]:
df_attr.isna().sum()
Out[ ]:
Attrition                   0
Age                         0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSinceLastPromotion     0
YearsWithCurrManager        0
dtype: int64

Seprate Categorical and Continiues columns¶

In [ ]:
cat = []
con = []
for i in df_attr.columns:
    if (df_attr[i].dtypes == "object"):
        cat.append(i)
    else:
        con.append(i)
In [ ]:
cat
Out[ ]:
['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus',
 'Over18',
 'OverTime']
In [ ]:
con
Out[ ]:
['Attrition',
 'Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EmployeeCount',
 'EmployeeNumber',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StandardHours',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']
In [ ]:
df_train_cat = cat
df_train_cat = df_attr[cat]
df_train_cat
Out[ ]:
BusinessTravel Department EducationField Gender JobRole MaritalStatus Over18 OverTime
0 Travel_Rarely Sales Life Sciences Female Sales Executive Single Y Yes
1 Travel_Frequently Research & Development Life Sciences Male Research Scientist Married Y No
2 Travel_Rarely Research & Development Other Male Laboratory Technician Single Y Yes
3 Travel_Frequently Research & Development Life Sciences Female Research Scientist Married Y Yes
4 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No
... ... ... ... ... ... ... ... ...
1465 Travel_Frequently Research & Development Medical Male Laboratory Technician Married Y No
1466 Travel_Rarely Research & Development Medical Male Healthcare Representative Married Y No
1467 Travel_Rarely Research & Development Life Sciences Male Manufacturing Director Married Y Yes
1468 Travel_Frequently Sales Medical Male Sales Executive Married Y No
1469 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No

1470 rows × 8 columns

In [ ]:
# Visulization of categorical columns
In [ ]:
plt.figure(figsize=(15,19))
for x1, i in enumerate(df_train_cat.columns):
    if df_train_cat[i].dtypes=='object':
        plt.subplot(3,3,x1+1)
        sns.countplot(x=df_train_cat[i])
In [ ]:
df_train_con = con
df_train_con = df_attr[con]
df_train_con
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 1102 1 2 1 1 2 94 3 2 4 5993 19479 8 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 279 8 1 1 2 3 61 2 2 2 5130 24907 1 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 1373 2 2 1 4 4 92 2 1 3 2090 2396 6 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 1392 3 4 1 5 4 56 3 1 3 2909 23159 1 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 591 2 1 1 7 1 40 3 1 2 3468 16632 9 12 3 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 36 884 23 2 1 2061 3 41 4 2 4 2571 12290 4 17 3 3 80 1 17 3 3 5 2 0 3
1466 0 39 613 6 1 1 2062 4 42 2 3 1 9991 21457 4 15 3 1 80 1 9 5 3 7 7 1 7
1467 0 27 155 4 3 1 2064 2 87 4 2 2 6142 5174 1 20 4 2 80 1 6 0 3 6 2 0 3
1468 0 49 1023 2 3 1 2065 4 63 2 2 2 5390 13243 2 14 3 4 80 0 17 3 2 9 6 0 8
1469 0 34 628 8 3 1 2068 2 82 4 2 3 4404 10228 2 12 3 1 80 0 6 3 4 4 3 1 2

1470 rows × 27 columns

In [ ]:
# Visualisation of continious columns
In [ ]:
plt.figure(figsize=(15,19))
for x1, i in enumerate(df_train_con.columns):
    if df_train_con[i].dtypes=='int64' or df_train_con[i].dtypes=='float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train_con[i])

Removing Outliers¶

In [ ]:
for i in df_train_con.columns:
    q1 = df_train_con[i].quantile(0.25)
    q3 = df_train_con[i].quantile(0.75)
    IQR = q3-q1
    uppertail = q3+1.5*IQR
    lowertail = q1-1.5*IQR
    df_train_con.loc[(df_train_con[i]>uppertail) | (df_train_con[i]<lowertail)]
    mean_1 = df_train_con[i].mean()
    df_train_con.loc[(df_train_con[i]>uppertail) | (df_train_con[i]<lowertail),i]=mean_1
In [ ]:
plt.figure(figsize=(15,19))
for x1, i in enumerate(df_train_con.columns):
    if df_train_con[i].dtypes=='int64' or df_train_con[i].dtypes=='float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train_con[i])

Checking Skewness¶

In [ ]:
df_attr.skew().sort_values()
Out[ ]:
WorkLifeBalance            -0.552480
JobInvolvement             -0.498419
JobSatisfaction            -0.329672
EnvironmentSatisfaction    -0.321654
RelationshipSatisfaction   -0.302828
Education                  -0.289681
HourlyRate                 -0.032311
DailyRate                  -0.003519
EmployeeCount               0.000000
StandardHours               0.000000
EmployeeNumber              0.016574
MonthlyRate                 0.018578
Age                         0.413286
TrainingTimesLastYear       0.553124
PercentSalaryHike           0.821128
YearsWithCurrManager        0.833451
YearsInCurrentRole          0.917363
DistanceFromHome            0.958118
StockOptionLevel            0.968980
JobLevel                    1.025401
NumCompaniesWorked          1.026471
TotalWorkingYears           1.117172
MonthlyIncome               1.369817
YearsAtCompany              1.764529
Attrition                   1.844366
PerformanceRating           1.921883
YearsSinceLastPromotion     1.984290
dtype: float64

Check Correlation¶

In [ ]:
plt.figure(figsize=(20,20))
sns.heatmap(df_attr.corr(), annot=True)
Out[ ]:
<AxesSubplot: >

2. Problem Statement¶

In [ ]:
# Create new/derived predictors (e.g Age group) for analysis
In [ ]:
df_attr.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [ ]:
df_attr['Age'].unique()
Out[ ]:
array([41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29, 31, 34, 28, 22, 53,
       24, 21, 42, 44, 46, 39, 43, 50, 26, 48, 55, 45, 56, 23, 51, 40, 54,
       58, 20, 25, 19, 57, 52, 47, 18, 60], dtype=int64)
In [ ]:
df_attr['Age'].value_counts()
Out[ ]:
35    78
34    77
31    69
36    69
29    68
32    61
30    60
33    58
38    58
40    57
37    50
27    48
28    48
42    46
39    42
45    41
41    40
26    39
46    33
44    33
43    32
50    30
24    26
25    26
47    24
49    24
55    22
48    19
51    19
53    19
52    18
54    18
22    16
56    14
58    14
23    14
21    13
20    11
59    10
19     9
18     8
60     5
57     4
Name: Age, dtype: int64
In [ ]:
df_attr['Age'].max()
Out[ ]:
60
In [ ]:
T = []
for i in df_attr['Age']:
    if(i>15 and i<=30):
        T.append('GROUP1')
        
    elif(i>30 and i<=45):
        T.append('GROUP2')
        
    else:
        T.append('GROUP3')
In [ ]:
df_attr['AGE_GROUP']=T
In [ ]:
df_attr.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager AGE_GROUP
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5 GROUP2
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7 GROUP3
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0 GROUP2
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0 GROUP2
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2 GROUP1

3. Problem Statement¶

In [ ]:
# 3. Explore the Data using Exploratory Data Analysis - for Y and all Xs
In [ ]:
df_attr3 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_attr3.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [ ]:
# Seprate Categorical and continious columns
In [ ]:
cat = []
con = []
for i in df_attr3.columns:
    if (df_attr3[i].dtypes == 'object'):
        cat.append(i)
    else:
        con.append(i)
In [ ]:
df_attr3_cat = cat
df_attr3_cat = df_attr3[cat]
df_attr3_cat
Out[ ]:
BusinessTravel Department EducationField Gender JobRole MaritalStatus Over18 OverTime
0 Travel_Rarely Sales Life Sciences Female Sales Executive Single Y Yes
1 Travel_Frequently Research & Development Life Sciences Male Research Scientist Married Y No
2 Travel_Rarely Research & Development Other Male Laboratory Technician Single Y Yes
3 Travel_Frequently Research & Development Life Sciences Female Research Scientist Married Y Yes
4 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No
... ... ... ... ... ... ... ... ...
1465 Travel_Frequently Research & Development Medical Male Laboratory Technician Married Y No
1466 Travel_Rarely Research & Development Medical Male Healthcare Representative Married Y No
1467 Travel_Rarely Research & Development Life Sciences Male Manufacturing Director Married Y Yes
1468 Travel_Frequently Sales Medical Male Sales Executive Married Y No
1469 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No

1470 rows × 8 columns

In [ ]:
df_attr3_con = con
df_attr3_con = df_attr3[con]
df_attr3_con
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 1102 1 2 1 1 2 94 3 2 4 5993 19479 8 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 279 8 1 1 2 3 61 2 2 2 5130 24907 1 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 1373 2 2 1 4 4 92 2 1 3 2090 2396 6 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 1392 3 4 1 5 4 56 3 1 3 2909 23159 1 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 591 2 1 1 7 1 40 3 1 2 3468 16632 9 12 3 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 36 884 23 2 1 2061 3 41 4 2 4 2571 12290 4 17 3 3 80 1 17 3 3 5 2 0 3
1466 0 39 613 6 1 1 2062 4 42 2 3 1 9991 21457 4 15 3 1 80 1 9 5 3 7 7 1 7
1467 0 27 155 4 3 1 2064 2 87 4 2 2 6142 5174 1 20 4 2 80 1 6 0 3 6 2 0 3
1468 0 49 1023 2 3 1 2065 4 63 2 2 2 5390 13243 2 14 3 4 80 0 17 3 2 9 6 0 8
1469 0 34 628 8 3 1 2068 2 82 4 2 3 4404 10228 2 12 3 1 80 0 6 3 4 4 3 1 2

1470 rows × 27 columns

In [ ]:
# Visualisation of categorical columns
In [ ]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_attr3_cat.columns):
    if df_attr3_cat[i].dtypes == 'object':
        plt.subplot(3,3,x1+1)
        sns.countplot(x=df_attr3_cat[i])
In [ ]:
# Visulisation of continious columns
In [ ]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_attr3_con.columns):
    if df_attr3_con[i].dtypes == 'int64' or df_attr3_con[i].dtypes == 'float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_attr3_con[i])

Removing Outliers¶

In [ ]:
df_attr3.describe()
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000
mean 0.161224 36.923810 802.485714 9.192517 2.912925 1.0 1024.865306 2.721769 65.891156 2.729932 2.063946 2.728571 6502.931293 14313.103401 2.693197 15.209524 3.153741 2.712245 80.0 0.793878 11.279592 2.799320 2.761224 7.008163 4.229252 2.187755 4.123129
std 0.367863 9.135373 403.509100 8.106864 1.024165 0.0 602.024335 1.093082 20.329428 0.711561 1.106940 1.102846 4707.956783 7117.786044 2.498009 3.659938 0.360824 1.081209 0.0 0.852077 7.780782 1.289271 0.706476 6.126525 3.623137 3.222430 3.568136
min 0.000000 18.000000 102.000000 1.000000 1.000000 1.0 1.000000 1.000000 30.000000 1.000000 1.000000 1.000000 1009.000000 2094.000000 0.000000 11.000000 3.000000 1.000000 80.0 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 30.000000 465.000000 2.000000 2.000000 1.0 491.250000 2.000000 48.000000 2.000000 1.000000 2.000000 2911.000000 8047.000000 1.000000 12.000000 3.000000 2.000000 80.0 0.000000 6.000000 2.000000 2.000000 3.000000 2.000000 0.000000 2.000000
50% 0.000000 36.000000 802.000000 7.000000 3.000000 1.0 1020.500000 3.000000 66.000000 3.000000 2.000000 3.000000 4919.000000 14235.500000 2.000000 14.000000 3.000000 3.000000 80.0 1.000000 10.000000 3.000000 3.000000 5.000000 3.000000 1.000000 3.000000
75% 0.000000 43.000000 1157.000000 14.000000 4.000000 1.0 1555.750000 4.000000 83.750000 3.000000 3.000000 4.000000 8379.000000 20461.500000 4.000000 18.000000 3.000000 4.000000 80.0 1.000000 15.000000 3.000000 3.000000 9.000000 7.000000 3.000000 7.000000
max 1.000000 60.000000 1499.000000 29.000000 5.000000 1.0 2068.000000 4.000000 100.000000 4.000000 5.000000 4.000000 19999.000000 26999.000000 9.000000 25.000000 4.000000 4.000000 80.0 3.000000 40.000000 6.000000 4.000000 40.000000 18.000000 15.000000 17.000000
In [ ]:
for i in df_attr3_con.columns:
    q1 = df_attr3_con[i].quantile(0.25)
    q3 = df_attr3_con[i].quantile(0.75)
    IQR = q3-q1
    uppertail = q3+1.5*IQR
    lowertail = q1-1.5*IQR
    df_attr3_con.loc[(df_attr3_con[i]>uppertail) | (df_attr3_con[i]<lowertail)]
    mean_1 = df_attr3_con[i].mean()
    df_attr3_con.loc[(df_attr3_con[i]>uppertail) | (df_attr3_con[i]<lowertail),i]=mean_1
    
In [ ]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_attr3_con.columns):
    if df_attr3_con[i].dtypes == 'int64' or df_attr3_con[i].dtypes == 'float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_attr3_con[i])

Checking Skewness¶

In [ ]:
df_attr3.skew().sort_values()
Out[ ]:
WorkLifeBalance            -0.552480
JobInvolvement             -0.498419
JobSatisfaction            -0.329672
EnvironmentSatisfaction    -0.321654
RelationshipSatisfaction   -0.302828
Education                  -0.289681
HourlyRate                 -0.032311
DailyRate                  -0.003519
EmployeeCount               0.000000
StandardHours               0.000000
EmployeeNumber              0.016574
MonthlyRate                 0.018578
Age                         0.413286
TrainingTimesLastYear       0.553124
PercentSalaryHike           0.821128
YearsWithCurrManager        0.833451
YearsInCurrentRole          0.917363
DistanceFromHome            0.958118
StockOptionLevel            0.968980
JobLevel                    1.025401
NumCompaniesWorked          1.026471
TotalWorkingYears           1.117172
MonthlyIncome               1.369817
YearsAtCompany              1.764529
Attrition                   1.844366
PerformanceRating           1.921883
YearsSinceLastPromotion     1.984290
dtype: float64

Checking Correlation¶

In [ ]:
df_attr3.corr()
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
Attrition 1.000000 -0.159205 -0.056652 0.077924 -0.031373 NaN -0.010577 -0.103369 -0.006846 -0.130016 -0.169105 -0.103481 -0.159840 0.015170 0.043494 -0.013478 0.002889 -0.045872 NaN -0.137145 -0.171063 -0.059478 -0.063939 -0.134392 -0.160545 -0.033019 -0.156199
Age -0.159205 1.000000 0.010661 -0.001686 0.208034 NaN -0.010145 0.010146 0.024287 0.029820 0.509604 -0.004892 0.497855 0.028051 0.299635 0.003634 0.001904 0.053535 NaN 0.037510 0.680381 -0.019621 -0.021490 0.311309 0.212901 0.216513 0.202089
DailyRate -0.056652 0.010661 1.000000 -0.004985 -0.016806 NaN -0.050990 0.018355 0.023381 0.046135 0.002966 0.030571 0.007707 -0.032182 0.038153 0.022704 0.000473 0.007846 NaN 0.042143 0.014515 0.002453 -0.037848 -0.034055 0.009932 -0.033229 -0.026363
DistanceFromHome 0.077924 -0.001686 -0.004985 1.000000 0.021042 NaN 0.032916 -0.016075 0.031131 0.008783 0.005303 -0.003669 -0.017014 0.027473 -0.029251 0.040235 0.027110 0.006557 NaN 0.044872 0.004628 -0.036942 -0.026556 0.009508 0.018845 0.010029 0.014406
Education -0.031373 0.208034 -0.016806 0.021042 1.000000 NaN 0.042070 -0.027128 0.016775 0.042438 0.101589 -0.011296 0.094961 -0.026084 0.126317 -0.011111 -0.024539 -0.009118 NaN 0.018422 0.148280 -0.025100 0.009819 0.069114 0.060236 0.054254 0.069065
EmployeeCount NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
EmployeeNumber -0.010577 -0.010145 -0.050990 0.032916 0.042070 NaN 1.000000 0.017621 0.035179 -0.006888 -0.018519 -0.046247 -0.014829 0.012648 -0.001251 -0.012944 -0.020359 -0.069861 NaN 0.062227 -0.014365 0.023603 0.010309 -0.011240 -0.008416 -0.009019 -0.009197
EnvironmentSatisfaction -0.103369 0.010146 0.018355 -0.016075 -0.027128 NaN 0.017621 1.000000 -0.049857 -0.008278 0.001212 -0.006784 -0.006259 0.037600 0.012594 -0.031701 -0.029548 0.007665 NaN 0.003432 -0.002693 -0.019359 0.027627 0.001458 0.018007 0.016194 -0.004999
HourlyRate -0.006846 0.024287 0.023381 0.031131 0.016775 NaN 0.035179 -0.049857 1.000000 0.042861 -0.027853 -0.071335 -0.015794 -0.015297 0.022157 -0.009062 -0.002172 0.001330 NaN 0.050263 -0.002334 -0.008548 -0.004607 -0.019582 -0.024106 -0.026716 -0.020123
JobInvolvement -0.130016 0.029820 0.046135 0.008783 0.042438 NaN -0.006888 -0.008278 0.042861 1.000000 -0.012630 -0.021476 -0.015271 -0.016322 0.015012 -0.017205 -0.029071 0.034297 NaN 0.021523 -0.005533 -0.015338 -0.014617 -0.021355 0.008717 -0.024184 0.025976
JobLevel -0.169105 0.509604 0.002966 0.005303 0.101589 NaN -0.018519 0.001212 -0.027853 -0.012630 1.000000 -0.001944 0.950300 0.039563 0.142501 -0.034730 -0.021222 0.021642 NaN 0.013984 0.782208 -0.018191 0.037818 0.534739 0.389447 0.353885 0.375281
JobSatisfaction -0.103481 -0.004892 0.030571 -0.003669 -0.011296 NaN -0.046247 -0.006784 -0.071335 -0.021476 -0.001944 1.000000 -0.007157 0.000644 -0.055699 0.020002 0.002297 -0.012454 NaN 0.010690 -0.020185 -0.005779 -0.019459 -0.003803 -0.002305 -0.018214 -0.027656
MonthlyIncome -0.159840 0.497855 0.007707 -0.017014 0.094961 NaN -0.014829 -0.006259 -0.015794 -0.015271 0.950300 -0.007157 1.000000 0.034814 0.149515 -0.027269 -0.017120 0.025873 NaN 0.005408 0.772893 -0.021736 0.030683 0.514285 0.363818 0.344978 0.344079
MonthlyRate 0.015170 0.028051 -0.032182 0.027473 -0.026084 NaN 0.012648 0.037600 -0.015297 -0.016322 0.039563 0.000644 0.034814 1.000000 0.017521 -0.006429 -0.009811 -0.004085 NaN -0.034323 0.026442 0.001467 0.007963 -0.023655 -0.012815 0.001567 -0.036746
NumCompaniesWorked 0.043494 0.299635 0.038153 -0.029251 0.126317 NaN -0.001251 0.012594 0.022157 0.015012 0.142501 -0.055699 0.149515 0.017521 1.000000 -0.010238 -0.014095 0.052733 NaN 0.030075 0.237639 -0.066054 -0.008366 -0.118421 -0.090754 -0.036814 -0.110319
PercentSalaryHike -0.013478 0.003634 0.022704 0.040235 -0.011111 NaN -0.012944 -0.031701 -0.009062 -0.017205 -0.034730 0.020002 -0.027269 -0.006429 -0.010238 1.000000 0.773550 -0.040490 NaN 0.007528 -0.020608 -0.005221 -0.003280 -0.035991 -0.001520 -0.022154 -0.011985
PerformanceRating 0.002889 0.001904 0.000473 0.027110 -0.024539 NaN -0.020359 -0.029548 -0.002172 -0.029071 -0.021222 0.002297 -0.017120 -0.009811 -0.014095 0.773550 1.000000 -0.031351 NaN 0.003506 0.006744 -0.015579 0.002572 0.003435 0.034986 0.017896 0.022827
RelationshipSatisfaction -0.045872 0.053535 0.007846 0.006557 -0.009118 NaN -0.069861 0.007665 0.001330 0.034297 0.021642 -0.012454 0.025873 -0.004085 0.052733 -0.040490 -0.031351 1.000000 NaN -0.045952 0.024054 0.002497 0.019604 0.019367 -0.015123 0.033493 -0.000867
StandardHours NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
StockOptionLevel -0.137145 0.037510 0.042143 0.044872 0.018422 NaN 0.062227 0.003432 0.050263 0.021523 0.013984 0.010690 0.005408 -0.034323 0.030075 0.007528 0.003506 -0.045952 NaN 1.000000 0.010136 0.011274 0.004129 0.015058 0.050818 0.014352 0.024698
TotalWorkingYears -0.171063 0.680381 0.014515 0.004628 0.148280 NaN -0.014365 -0.002693 -0.002334 -0.005533 0.782208 -0.020185 0.772893 0.026442 0.237639 -0.020608 0.006744 0.024054 NaN 0.010136 1.000000 -0.035662 0.001008 0.628133 0.460365 0.404858 0.459188
TrainingTimesLastYear -0.059478 -0.019621 0.002453 -0.036942 -0.025100 NaN 0.023603 -0.019359 -0.008548 -0.015338 -0.018191 -0.005779 -0.021736 0.001467 -0.066054 -0.005221 -0.015579 0.002497 NaN 0.011274 -0.035662 1.000000 0.028072 0.003569 -0.005738 -0.002067 -0.004096
WorkLifeBalance -0.063939 -0.021490 -0.037848 -0.026556 0.009819 NaN 0.010309 0.027627 -0.004607 -0.014617 0.037818 -0.019459 0.030683 0.007963 -0.008366 -0.003280 0.002572 0.019604 NaN 0.004129 0.001008 0.028072 1.000000 0.012089 0.049856 0.008941 0.002759
YearsAtCompany -0.134392 0.311309 -0.034055 0.009508 0.069114 NaN -0.011240 0.001458 -0.019582 -0.021355 0.534739 -0.003803 0.514285 -0.023655 -0.118421 -0.035991 0.003435 0.019367 NaN 0.015058 0.628133 0.003569 0.012089 1.000000 0.758754 0.618409 0.769212
YearsInCurrentRole -0.160545 0.212901 0.009932 0.018845 0.060236 NaN -0.008416 0.018007 -0.024106 0.008717 0.389447 -0.002305 0.363818 -0.012815 -0.090754 -0.001520 0.034986 -0.015123 NaN 0.050818 0.460365 -0.005738 0.049856 0.758754 1.000000 0.548056 0.714365
YearsSinceLastPromotion -0.033019 0.216513 -0.033229 0.010029 0.054254 NaN -0.009019 0.016194 -0.026716 -0.024184 0.353885 -0.018214 0.344978 0.001567 -0.036814 -0.022154 0.017896 0.033493 NaN 0.014352 0.404858 -0.002067 0.008941 0.618409 0.548056 1.000000 0.510224
YearsWithCurrManager -0.156199 0.202089 -0.026363 0.014406 0.069065 NaN -0.009197 -0.004999 -0.020123 0.025976 0.375281 -0.027656 0.344079 -0.036746 -0.110319 -0.011985 0.022827 -0.000867 NaN 0.024698 0.459188 -0.004096 0.002759 0.769212 0.714365 0.510224 1.000000
In [ ]:
df_dum_train3 = pd.get_dummies(df_attr3_cat)
In [ ]:
df_dum_train3.head()
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes
0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1
1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0
2 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1
3 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1
4 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0

Standardization¶

In [ ]:
std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_attr3_con)
In [ ]:
x3 = pd.DataFrame(std_scaler1,columns=df_attr3_con.columns)
In [ ]:
x3.head()
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 2.280906 0.446350 0.742527 -1.010909 -0.891688 0.0 -1.701283 -0.660531 1.383138 0.379672 -0.057788 1.153254 0.129018 0.726020 2.529583 -1.150554 -0.426230 -1.584178 0.0 -1.018674 -0.374906 0.312607 -2.493820 0.041137 -0.018341 -0.795491 0.294570
1 -0.438422 1.322365 -1.297775 -0.147150 -1.868426 0.0 -1.699621 0.254625 -0.240677 -1.026167 -0.057788 -0.660853 -0.140791 1.488876 -0.672478 2.129306 2.346151 1.191438 0.0 0.510149 -0.057867 0.601911 0.338096 1.069787 0.882230 -0.277632 0.888852
2 2.280906 0.008343 1.414363 -0.887515 -0.891688 0.0 -1.696298 1.169781 1.284725 -1.026167 -0.961486 0.246200 -1.091220 -1.674841 1.614708 -0.057267 -0.426230 -0.658973 0.0 -1.018674 -0.533426 0.601911 0.338096 -1.501837 -1.219103 -0.795491 -1.191138
3 -0.438422 -0.429664 1.461466 -0.764121 1.061787 0.0 -1.694636 1.169781 -0.486709 0.379672 -0.961486 0.246200 -0.835167 1.243211 -0.672478 -1.150554 -0.426230 0.266233 0.0 -1.018674 -0.374906 0.601911 0.338096 0.555462 0.882230 0.758085 -1.191138
4 -0.438422 -1.086676 -0.524295 -0.887515 -1.868426 0.0 -1.691313 -1.575686 -1.274014 0.379672 -0.961486 -0.660853 -0.660400 0.325900 0.102053 -0.877232 -0.426230 1.191438 0.0 0.510149 -0.691946 0.601911 0.338096 -0.987512 -0.618722 0.240227 -0.596855
In [ ]:
train_merge3 = pd.concat([df_dum_train3,x3],axis=1)
In [ ]:
train_merge3.head()
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 2.280906 0.446350 0.742527 -1.010909 -0.891688 0.0 -1.701283 -0.660531 1.383138 0.379672 -0.057788 1.153254 0.129018 0.726020 2.529583 -1.150554 -0.426230 -1.584178 0.0 -1.018674 -0.374906 0.312607 -2.493820 0.041137 -0.018341 -0.795491 0.294570
1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 -0.438422 1.322365 -1.297775 -0.147150 -1.868426 0.0 -1.699621 0.254625 -0.240677 -1.026167 -0.057788 -0.660853 -0.140791 1.488876 -0.672478 2.129306 2.346151 1.191438 0.0 0.510149 -0.057867 0.601911 0.338096 1.069787 0.882230 -0.277632 0.888852
2 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 2.280906 0.008343 1.414363 -0.887515 -0.891688 0.0 -1.696298 1.169781 1.284725 -1.026167 -0.961486 0.246200 -1.091220 -1.674841 1.614708 -0.057267 -0.426230 -0.658973 0.0 -1.018674 -0.533426 0.601911 0.338096 -1.501837 -1.219103 -0.795491 -1.191138
3 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 -0.438422 -0.429664 1.461466 -0.764121 1.061787 0.0 -1.694636 1.169781 -0.486709 0.379672 -0.961486 0.246200 -0.835167 1.243211 -0.672478 -1.150554 -0.426230 0.266233 0.0 -1.018674 -0.374906 0.601911 0.338096 0.555462 0.882230 0.758085 -1.191138
4 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 -0.438422 -1.086676 -0.524295 -0.887515 -1.868426 0.0 -1.691313 -1.575686 -1.274014 0.379672 -0.961486 -0.660853 -0.660400 0.325900 0.102053 -0.877232 -0.426230 1.191438 0.0 0.510149 -0.691946 0.601911 0.338096 -0.987512 -0.618722 0.240227 -0.596855

Checking linearity¶

In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
In [ ]:
vif3 = pd.DataFrame()
vif3['Features']=train_merge3.columns
vif3['VIF'] = [variance_inflation_factor(train_merge3.values,i) for i in range(train_merge3.shape[1])]
vif3
Out[ ]:
Features VIF
0 BusinessTravel_Non-Travel inf
1 BusinessTravel_Travel_Frequently inf
2 BusinessTravel_Travel_Rarely inf
3 Department_Human Resources inf
4 Department_Research & Development inf
5 Department_Sales inf
6 EducationField_Human Resources inf
7 EducationField_Life Sciences inf
8 EducationField_Marketing inf
9 EducationField_Medical inf
10 EducationField_Other inf
11 EducationField_Technical Degree inf
12 Gender_Female inf
13 Gender_Male inf
14 JobRole_Healthcare Representative inf
15 JobRole_Human Resources inf
16 JobRole_Laboratory Technician inf
17 JobRole_Manager inf
18 JobRole_Manufacturing Director inf
19 JobRole_Research Director inf
20 JobRole_Research Scientist inf
21 JobRole_Sales Executive inf
22 JobRole_Sales Representative inf
23 MaritalStatus_Divorced inf
24 MaritalStatus_Married inf
25 MaritalStatus_Single inf
26 Over18_Y 0.000000
27 OverTime_No inf
28 OverTime_Yes inf
29 Attrition 1.347585
30 Age 1.758788
31 DailyRate 1.038182
32 DistanceFromHome 1.038291
33 Education 1.087140
34 EmployeeCount NaN
35 EmployeeNumber 1.035390
36 EnvironmentSatisfaction 1.060889
37 HourlyRate 1.032229
38 JobInvolvement 1.042954
39 JobLevel 6.346295
40 JobSatisfaction 1.046714
41 MonthlyIncome 2.667871
42 MonthlyRate 1.025837
43 NumCompaniesWorked 1.282556
44 PercentSalaryHike 2.572648
45 PerformanceRating 2.550537
46 RelationshipSatisfaction 1.037069
47 StandardHours NaN
48 StockOptionLevel 2.185174
49 TotalWorkingYears 2.756983
50 TrainingTimesLastYear 1.030164
51 WorkLifeBalance 1.032552
52 YearsAtCompany 4.548332
53 YearsInCurrentRole 3.380085
54 YearsSinceLastPromotion 1.329397
55 YearsWithCurrManager 2.937798
In [ ]:
featurestodrop = vif3.loc[vif3['VIF']>10]
droplist = featurestodrop['Features']
droplist = list(droplist)
len(droplist)
print(droplist)
['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 'Department_Human Resources', 'Department_Research & Development', 'Department_Sales', 'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing', 'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative', 'JobRole_Human Resources', 'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director', 'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive', 'JobRole_Sales Representative', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_No', 'OverTime_Yes']
In [ ]:
drop_list3 = ['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 
              'Department_Human Resources', 'Department_Research & Development', 'Department_Sales', 
              'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing', 
              'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Female', 
              'Gender_Male', 'JobRole_Healthcare Representative', 'JobRole_Human Resources', 
              'JobRole_Laboratory Technician', 'JobRole_Manager','JobRole_Manufacturing Director', 
              'JobRole_Research Director','JobRole_Research Scientist', 'JobRole_Sales Executive', 
              'JobRole_Sales Representative', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 
              'OverTime_No', 'OverTime_Yes']
In [ ]:
df_final3 = train_merge3.drop(drop_list3,axis=1)
In [ ]:
df_final3.head()
Out[ ]:
Over18_Y Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 2.280906 0.446350 0.742527 -1.010909 -0.891688 0.0 -1.701283 -0.660531 1.383138 0.379672 -0.057788 1.153254 0.129018 0.726020 2.529583 -1.150554 -0.426230 -1.584178 0.0 -1.018674 -0.374906 0.312607 -2.493820 0.041137 -0.018341 -0.795491 0.294570
1 1 -0.438422 1.322365 -1.297775 -0.147150 -1.868426 0.0 -1.699621 0.254625 -0.240677 -1.026167 -0.057788 -0.660853 -0.140791 1.488876 -0.672478 2.129306 2.346151 1.191438 0.0 0.510149 -0.057867 0.601911 0.338096 1.069787 0.882230 -0.277632 0.888852
2 1 2.280906 0.008343 1.414363 -0.887515 -0.891688 0.0 -1.696298 1.169781 1.284725 -1.026167 -0.961486 0.246200 -1.091220 -1.674841 1.614708 -0.057267 -0.426230 -0.658973 0.0 -1.018674 -0.533426 0.601911 0.338096 -1.501837 -1.219103 -0.795491 -1.191138
3 1 -0.438422 -0.429664 1.461466 -0.764121 1.061787 0.0 -1.694636 1.169781 -0.486709 0.379672 -0.961486 0.246200 -0.835167 1.243211 -0.672478 -1.150554 -0.426230 0.266233 0.0 -1.018674 -0.374906 0.601911 0.338096 0.555462 0.882230 0.758085 -1.191138
4 1 -0.438422 -1.086676 -0.524295 -0.887515 -1.868426 0.0 -1.691313 -1.575686 -1.274014 0.379672 -0.961486 -0.660853 -0.660400 0.325900 0.102053 -0.877232 -0.426230 1.191438 0.0 0.510149 -0.691946 0.601911 0.338096 -0.987512 -0.618722 0.240227 -0.596855

Ckeaking Skewness¶

In [ ]:
train_merge3.skew().sort_values()
Out[ ]:
OverTime_No                         -0.964489
BusinessTravel_Travel_Rarely        -0.923992
Department_Research & Development   -0.646936
WorkLifeBalance                     -0.552480
JobInvolvement                      -0.498419
Gender_Male                         -0.408665
JobSatisfaction                     -0.329672
EnvironmentSatisfaction             -0.321654
RelationshipSatisfaction            -0.302828
Education                           -0.289681
HourlyRate                          -0.032311
DailyRate                           -0.003519
TrainingTimesLastYear               -0.001942
StandardHours                        0.000000
Over18_Y                             0.000000
EmployeeCount                        0.000000
EmployeeNumber                       0.016574
MonthlyRate                          0.018578
MaritalStatus_Married                0.169484
EducationField_Life Sciences         0.356919
Gender_Female                        0.408665
Age                                  0.413286
StockOptionLevel                     0.512145
YearsWithCurrManager                 0.652676
YearsInCurrentRole                   0.686683
YearsAtCompany                       0.733775
MaritalStatus_Single                 0.773874
TotalWorkingYears                    0.779596
EducationField_Medical               0.794118
PercentSalaryHike                    0.821128
Department_Sales                     0.856158
DistanceFromHome                     0.958118
OverTime_Yes                         0.964489
NumCompaniesWorked                   0.987236
JobLevel                             1.025401
MonthlyIncome                        1.159989
MaritalStatus_Divorced               1.336093
JobRole_Sales Executive              1.340834
JobRole_Research Scientist           1.512214
YearsSinceLastPromotion              1.520114
BusinessTravel_Travel_Frequently     1.595067
JobRole_Laboratory Technician        1.701604
Attrition                            1.844366
PerformanceRating                    1.921883
EducationField_Marketing             2.525783
BusinessTravel_Non-Travel            2.632066
JobRole_Manufacturing Director       2.694844
EducationField_Technical Degree      2.872604
JobRole_Healthcare Representative    2.887251
JobRole_Manager                      3.392611
JobRole_Sales Representative         3.847192
EducationField_Other                 3.875119
JobRole_Research Director            3.932443
Department_Human Resources           4.518824
JobRole_Human Resources              5.035637
EducationField_Human Resources       7.181112
dtype: float64

Checking Correlation¶

In [ ]:
train_merge3.corr()['Attrition']
Out[ ]:
BusinessTravel_Non-Travel           -0.074457
BusinessTravel_Travel_Frequently     0.115143
BusinessTravel_Travel_Rarely        -0.049538
Department_Human Resources           0.016832
Department_Research & Development   -0.085293
Department_Sales                     0.080855
EducationField_Human Resources       0.036466
EducationField_Life Sciences        -0.032703
EducationField_Marketing             0.055781
EducationField_Medical              -0.046999
EducationField_Other                -0.017898
EducationField_Technical Degree      0.069355
Gender_Female                       -0.029453
Gender_Male                          0.029453
JobRole_Healthcare Representative   -0.078696
JobRole_Human Resources              0.036215
JobRole_Laboratory Technician        0.098290
JobRole_Manager                     -0.083316
JobRole_Manufacturing Director      -0.082994
JobRole_Research Director           -0.088870
JobRole_Research Scientist          -0.000360
JobRole_Sales Executive              0.019774
JobRole_Sales Representative         0.157234
MaritalStatus_Divorced              -0.087716
MaritalStatus_Married               -0.090984
MaritalStatus_Single                 0.175419
Over18_Y                                  NaN
OverTime_No                         -0.246118
OverTime_Yes                         0.246118
Attrition                            1.000000
Age                                 -0.159205
DailyRate                           -0.056652
DistanceFromHome                     0.077924
Education                           -0.031373
EmployeeCount                             NaN
EmployeeNumber                      -0.010577
EnvironmentSatisfaction             -0.103369
HourlyRate                          -0.006846
JobInvolvement                      -0.130016
JobLevel                            -0.169105
JobSatisfaction                     -0.103481
MonthlyIncome                       -0.146207
MonthlyRate                          0.015170
NumCompaniesWorked                   0.030383
PercentSalaryHike                   -0.013478
PerformanceRating                    0.002889
RelationshipSatisfaction            -0.045872
StandardHours                             NaN
StockOptionLevel                    -0.186680
TotalWorkingYears                   -0.183018
TrainingTimesLastYear                0.005146
WorkLifeBalance                     -0.063939
YearsAtCompany                      -0.172690
YearsInCurrentRole                  -0.164386
YearsSinceLastPromotion             -0.026458
YearsWithCurrManager                -0.150640
Name: Attrition, dtype: float64

4. Problem Statements¶

In [ ]:
# 4. Explore the data using Exploratory Data Analysis- for pairs of Y and all xs
In [ ]:
# Read the data

df_attr4 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_attr4.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [ ]:
# EDA
In [ ]:
# Seperate Categorical and continious columns
In [ ]:
cat = []
con = []
for i in df_attr4.columns:
    if (df_attr4[i].dtypes == 'object'):
        cat.append(i)
    else:
        con.append(i)
In [ ]:
df_train_cat4 = cat
df_train_cat4 = df_attr4[cat]
df_train_cat4
Out[ ]:
BusinessTravel Department EducationField Gender JobRole MaritalStatus Over18 OverTime
0 Travel_Rarely Sales Life Sciences Female Sales Executive Single Y Yes
1 Travel_Frequently Research & Development Life Sciences Male Research Scientist Married Y No
2 Travel_Rarely Research & Development Other Male Laboratory Technician Single Y Yes
3 Travel_Frequently Research & Development Life Sciences Female Research Scientist Married Y Yes
4 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No
... ... ... ... ... ... ... ... ...
1465 Travel_Frequently Research & Development Medical Male Laboratory Technician Married Y No
1466 Travel_Rarely Research & Development Medical Male Healthcare Representative Married Y No
1467 Travel_Rarely Research & Development Life Sciences Male Manufacturing Director Married Y Yes
1468 Travel_Frequently Sales Medical Male Sales Executive Married Y No
1469 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No

1470 rows × 8 columns

In [ ]:
df_train_con4 = con
df_train_con4 = df_attr4[con]
df_train_con4
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 1102 1 2 1 1 2 94 3 2 4 5993 19479 8 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 279 8 1 1 2 3 61 2 2 2 5130 24907 1 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 1373 2 2 1 4 4 92 2 1 3 2090 2396 6 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 1392 3 4 1 5 4 56 3 1 3 2909 23159 1 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 591 2 1 1 7 1 40 3 1 2 3468 16632 9 12 3 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 36 884 23 2 1 2061 3 41 4 2 4 2571 12290 4 17 3 3 80 1 17 3 3 5 2 0 3
1466 0 39 613 6 1 1 2062 4 42 2 3 1 9991 21457 4 15 3 1 80 1 9 5 3 7 7 1 7
1467 0 27 155 4 3 1 2064 2 87 4 2 2 6142 5174 1 20 4 2 80 1 6 0 3 6 2 0 3
1468 0 49 1023 2 3 1 2065 4 63 2 2 2 5390 13243 2 14 3 4 80 0 17 3 2 9 6 0 8
1469 0 34 628 8 3 1 2068 2 82 4 2 3 4404 10228 2 12 3 1 80 0 6 3 4 4 3 1 2

1470 rows × 27 columns

In [ ]:
# Visulization of categorical columns
In [ ]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train_cat4.columns):
    if df_train_cat4[i].dtypes == 'object':
        plt.subplot(3,3,x1+1)
        sns.countplot(x=df_train_cat4[i])
In [ ]:
# Visulisation of continous columns 
In [ ]:
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train_con4.columns):
    if df_train_con4[i].dtypes == 'int64' or df_train_con4[i].dtypes == 'float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train_con4[i])

Removing Outliers¶

In [ ]:
for i in df_train_con4.columns:
    q1 = df_train_con4[i].quantile(0.25)
    q2 = df_train_con4[i].quantile(0.75)
    IQR = q3-q1
    uppertail = q3+1.5*IQR
    lowertail = q1-1.5*IQR
    df_train_con4.loc[(df_train_con4[i]>uppertail) | (df_train_con4[i]<lowertail)]
    mean_1 = df_train_con4[i].mean()
    df_train_con4.loc[(df_train_con4[i]>uppertail) | (df_train_con4[i]<lowertail),i]=mean_1
In [ ]:
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train_con4.columns):
    if df_train_con4[i].dtypes == 'int64' or df_train_con4[i].dtypes == 'float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train_con4[i])

One hot Encoding¶

In [ ]:
df_dum_train4 = pd.get_dummies(df_train_cat4)
In [ ]:
df_dum_train4
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes
0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1
1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0
2 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1
3 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1
4 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0
1466 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0
1467 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1
1468 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0
1469 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0

1470 rows × 29 columns

Standardization¶

In [ ]:
std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_train_con4)
In [ ]:
x4 = pd.DataFrame(std_scaler1,columns=df_train_con4.columns)
In [ ]:
x4.head()
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 2.280906 7.105427e-15 0.0 -1.412470 -0.891688 0.0 4.547474e-13 -0.660531 -2.842171e-14 0.379672 -0.057788 1.153254 0.0 1.818989e-12 2.125136 3.552714e-15 -0.426230 -1.584178 0.0 -0.932014 -0.905232 -2.171982 -2.493820 0.184086 -0.018341 -0.679146 0.294570
1 -0.438422 7.105427e-15 0.0 0.509209 -1.868426 0.0 4.547474e-13 0.254625 -2.842171e-14 -1.026167 -0.057788 -0.660853 0.0 1.818989e-12 -0.678049 3.552714e-15 2.346151 1.191438 0.0 0.241988 0.587285 0.155707 0.338096 1.440441 0.882230 -0.368715 0.888852
2 2.280906 7.105427e-15 0.0 -1.137945 -0.891688 0.0 4.547474e-13 1.169781 -2.842171e-14 -1.026167 -0.961486 0.246200 0.0 1.818989e-12 1.324226 3.552714e-15 -0.426230 -0.658973 0.0 -0.932014 -1.360324 0.155707 0.338096 -1.700445 -1.219103 -0.679146 -1.191138
3 -0.438422 7.105427e-15 0.0 -0.863419 1.061787 0.0 4.547474e-13 1.169781 -2.842171e-14 0.379672 -0.961486 0.246200 0.0 1.818989e-12 -0.678049 3.552714e-15 -0.426230 0.266233 0.0 -0.932014 -0.905232 0.155707 0.338096 0.812264 0.882230 0.252146 -1.191138
4 -0.438422 7.105427e-15 0.0 -1.137945 -1.868426 0.0 4.547474e-13 -1.575686 -2.842171e-14 0.379672 -0.961486 -0.660853 0.0 1.818989e-12 2.525591 3.552714e-15 -0.426230 1.191438 0.0 0.241988 -1.815417 0.155707 0.338096 -1.072268 -0.618722 -0.058285 -0.596855
In [ ]:
df_train_merge4 = pd.concat([df_dum_train4,x4],axis=1)
In [ ]:
df_train_merge4.head()
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 2.280906 7.105427e-15 0.0 -1.412470 -0.891688 0.0 4.547474e-13 -0.660531 -2.842171e-14 0.379672 -0.057788 1.153254 0.0 1.818989e-12 2.125136 3.552714e-15 -0.426230 -1.584178 0.0 -0.932014 -0.905232 -2.171982 -2.493820 0.184086 -0.018341 -0.679146 0.294570
1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 -0.438422 7.105427e-15 0.0 0.509209 -1.868426 0.0 4.547474e-13 0.254625 -2.842171e-14 -1.026167 -0.057788 -0.660853 0.0 1.818989e-12 -0.678049 3.552714e-15 2.346151 1.191438 0.0 0.241988 0.587285 0.155707 0.338096 1.440441 0.882230 -0.368715 0.888852
2 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 2.280906 7.105427e-15 0.0 -1.137945 -0.891688 0.0 4.547474e-13 1.169781 -2.842171e-14 -1.026167 -0.961486 0.246200 0.0 1.818989e-12 1.324226 3.552714e-15 -0.426230 -0.658973 0.0 -0.932014 -1.360324 0.155707 0.338096 -1.700445 -1.219103 -0.679146 -1.191138
3 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 -0.438422 7.105427e-15 0.0 -0.863419 1.061787 0.0 4.547474e-13 1.169781 -2.842171e-14 0.379672 -0.961486 0.246200 0.0 1.818989e-12 -0.678049 3.552714e-15 -0.426230 0.266233 0.0 -0.932014 -0.905232 0.155707 0.338096 0.812264 0.882230 0.252146 -1.191138
4 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 -0.438422 7.105427e-15 0.0 -1.137945 -1.868426 0.0 4.547474e-13 -1.575686 -2.842171e-14 0.379672 -0.961486 -0.660853 0.0 1.818989e-12 2.525591 3.552714e-15 -0.426230 1.191438 0.0 0.241988 -1.815417 0.155707 0.338096 -1.072268 -0.618722 -0.058285 -0.596855

Checking Correlation¶

In [ ]:
df_train_merge4.corr()
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
BusinessTravel_Non-Travel 1.000000 -0.162435 -0.526850 -0.004755 -0.005013 0.007283 0.020835 0.005311 -0.030567 0.012828 -0.013389 0.004171 -0.050461 0.050461 0.012878 -0.015890 0.009270 0.014078 -0.013536 -0.021431 -0.010116 0.031022 -0.033780 0.057455 -0.043635 -0.004622 NaN 0.037163 -0.037163 -0.074457 NaN NaN 0.008097 0.004524 NaN NaN 0.003568 NaN -0.045779 -0.007295 0.019802 NaN NaN 0.002718 NaN 0.018310 0.021132 NaN 0.028807 0.018476 -0.020746 0.005780 0.023331 0.014365 0.020815 0.029183
BusinessTravel_Travel_Frequently -0.162435 1.000000 -0.753092 -0.007485 0.003340 -0.000160 0.011818 0.031128 -0.016586 -0.005367 -0.011004 -0.023569 0.022015 -0.022015 0.008029 0.001896 0.010023 -0.042583 0.009783 -0.023579 -0.004461 -0.010175 0.055469 0.005779 -0.030785 0.027734 NaN -0.029392 0.029392 0.115143 NaN NaN -0.022222 -0.008292 NaN NaN -0.012624 NaN 0.004424 -0.021557 0.027117 NaN NaN -0.039718 NaN 0.016463 0.028500 NaN -0.016142 0.006153 0.006193 0.010199 0.021100 0.013334 0.023216 0.029774
BusinessTravel_Travel_Rarely -0.526850 -0.753092 1.000000 0.009618 0.000465 -0.004718 -0.024073 -0.030355 0.034668 -0.003930 0.018406 0.017521 0.014682 -0.014682 -0.015503 0.008962 -0.014815 0.027294 0.000598 0.034600 0.010588 -0.011920 -0.025257 -0.043287 0.055613 -0.020808 NaN 0.000539 -0.000539 -0.049538 NaN NaN 0.013743 0.004126 NaN NaN 0.008496 NaN 0.026714 0.023433 -0.036562 NaN NaN 0.032401 NaN -0.026390 -0.038640 NaN -0.005303 -0.017619 0.008498 -0.012640 -0.033732 -0.021064 -0.033877 -0.045106
Department_Human Resources -0.004755 -0.007485 0.009618 1.000000 -0.290754 -0.139650 0.646436 -0.068040 -0.073692 -0.049761 -0.007527 -0.019469 -0.035652 0.035652 -0.066186 0.904983 -0.097859 0.087615 -0.070000 -0.050765 -0.105352 -0.112959 -0.051764 0.016037 0.034767 -0.051443 NaN 0.006178 -0.006178 0.016832 NaN NaN -0.019777 0.011435 NaN NaN -0.007597 NaN 0.004789 -0.006157 -0.024068 NaN NaN 0.020618 NaN -0.006385 0.034583 NaN -0.004000 -0.007662 -0.040022 0.047763 -0.005295 -0.033121 -0.026931 -0.021503
Department_Research & Development -0.005013 0.003340 0.000465 -0.290754 1.000000 -0.906818 -0.187954 0.127321 -0.478520 0.183548 0.064751 0.038541 -0.015760 0.015760 0.227637 -0.263128 0.336570 -0.071356 0.240754 0.174596 0.362340 -0.733497 -0.336127 0.035158 -0.019997 -0.009990 NaN 0.003036 -0.003036 -0.085293 NaN NaN -0.026238 -0.018604 NaN NaN 0.027976 NaN 0.023187 -0.107830 -0.002798 NaN NaN 0.022237 NaN 0.032720 -0.004587 NaN 0.016927 -0.004442 -0.006819 -0.069922 -0.036307 -0.037461 -0.021497 -0.024626
Department_Sales 0.007283 -0.000160 -0.004718 -0.139650 -0.906818 1.000000 -0.090275 -0.101791 0.527691 -0.168034 -0.063695 -0.031309 0.032017 -0.032017 -0.206425 -0.126381 -0.305208 0.035248 -0.218320 -0.158327 -0.328576 0.808869 0.370667 -0.043451 0.005378 0.033002 NaN -0.005864 0.005864 0.080855 NaN NaN 0.035867 0.014215 NaN NaN -0.025606 NaN -0.026107 0.114307 0.013499 NaN NaN -0.032097 NaN -0.031050 -0.010489 NaN -0.015755 0.007973 0.024688 0.051320 0.039907 0.053360 0.034112 0.034959
EducationField_Human Resources 0.020835 0.011818 -0.024073 0.646436 -0.187954 -0.090275 1.000000 -0.114559 -0.047637 -0.092899 -0.033248 -0.042964 -0.028956 0.028956 -0.042785 0.549751 -0.063260 0.082271 -0.045251 -0.032816 -0.068103 -0.073020 -0.033462 0.012107 0.057339 -0.072051 NaN -0.004040 0.004040 0.036466 NaN NaN -0.005234 0.026479 NaN NaN -0.006898 NaN 0.002079 0.010409 -0.021467 NaN NaN 0.031007 NaN -0.016167 0.041105 NaN 0.021206 0.004348 -0.037664 -0.003967 -0.008196 -0.020759 -0.023700 -0.025943
EducationField_Life Sciences 0.005311 0.031128 -0.030355 -0.068040 0.127321 -0.101791 -0.114559 1.000000 -0.291660 -0.568774 -0.203560 -0.263050 -0.006770 0.006770 0.029084 -0.063119 0.044359 -0.011143 0.052023 0.018401 0.043729 -0.091122 -0.043208 -0.002672 -0.017866 0.021469 NaN 0.013787 -0.013787 -0.032703 NaN NaN -0.045394 0.013184 NaN NaN -0.024526 NaN 0.003228 -0.008431 0.052004 NaN NaN -0.006131 NaN 0.010853 -0.019973 NaN -0.017993 0.047855 -0.039018 -0.039728 -0.013111 0.004035 -0.002480 0.003488
EducationField_Marketing -0.030567 -0.016586 0.034668 -0.073692 -0.478520 0.527691 -0.047637 -0.291660 1.000000 -0.236514 -0.084647 -0.109385 0.024143 -0.024143 -0.108929 -0.066690 -0.161055 0.025577 -0.115206 -0.083548 -0.173387 0.457308 0.133065 -0.007212 0.018491 -0.013323 NaN -0.014607 0.014607 0.055781 NaN NaN 0.054723 0.072405 NaN NaN 0.000479 NaN -0.018657 0.092698 -0.023528 NaN NaN -0.018611 NaN -0.020918 -0.006580 NaN 0.022560 0.029466 -0.029046 0.018500 0.016770 0.037816 0.006219 0.022774
EducationField_Medical 0.012828 -0.005367 -0.003930 -0.049761 0.183548 -0.168034 -0.092899 -0.568774 -0.236514 1.000000 -0.165072 -0.213314 0.013146 -0.013146 0.034165 -0.042895 0.066262 -0.001128 0.035496 0.062898 0.039735 -0.133532 -0.051990 0.013316 -0.007139 -0.004249 NaN -0.002246 0.002246 -0.046999 NaN NaN 0.021864 -0.072335 NaN NaN -0.021299 NaN 0.017103 -0.014114 -0.022645 NaN NaN 0.024826 NaN 0.014868 0.030494 NaN 0.033750 -0.050973 0.070542 0.001641 0.009381 -0.020740 0.022665 -0.005065
EducationField_Other -0.013389 -0.011004 0.018406 -0.007527 0.064751 -0.063695 -0.033248 -0.203560 -0.084647 -0.165072 1.000000 -0.076343 -0.022992 0.022992 0.017609 0.001594 0.058759 -0.008046 -0.010820 -0.006044 0.005286 -0.036995 -0.033774 0.005411 -0.009171 0.004972 NaN -0.024970 0.024970 -0.017898 NaN NaN 0.027156 0.038043 NaN NaN 0.064602 NaN -0.011895 -0.016724 0.003380 NaN NaN -0.012870 NaN 0.011449 -0.020305 NaN -0.042100 -0.039133 -0.008151 0.031812 -0.022235 -0.006238 -0.039931 -0.015490
EducationField_Technical Degree 0.004171 -0.023569 0.017521 -0.019469 0.038541 -0.031309 -0.042964 -0.263050 -0.109385 -0.213314 -0.076343 1.000000 -0.003886 0.003886 0.018681 -0.008623 -0.026589 -0.038946 0.007817 -0.022905 0.076218 -0.058843 0.057185 -0.019243 0.002710 0.014265 NaN 0.017723 -0.017723 0.069355 NaN NaN -0.036177 -0.026742 NaN NaN 0.027713 NaN -0.004519 -0.054707 -0.019795 NaN NaN -0.013819 NaN -0.021729 -0.011044 NaN -0.024560 -0.002168 0.008289 0.021962 0.010803 0.000445 0.003853 0.002107
Gender_Female -0.050461 0.022015 0.014682 -0.035652 -0.015760 0.032017 -0.028956 -0.006770 0.024143 0.013146 -0.022992 -0.003886 1.000000 -1.000000 -0.006823 -0.036082 -0.067793 0.033880 0.065197 0.006121 -0.009745 0.005348 0.028877 -0.046076 0.007804 0.032752 NaN -0.041924 0.041924 -0.029453 NaN NaN -0.003885 0.016547 NaN NaN -0.000508 NaN -0.017960 0.039403 -0.033252 NaN NaN 0.039147 NaN 0.013859 -0.022868 NaN -0.012716 0.040533 0.038787 0.002753 0.026297 0.032762 0.026985 0.034464
Gender_Male 0.050461 -0.022015 -0.014682 0.035652 0.015760 -0.032017 0.028956 0.006770 -0.024143 -0.013146 0.022992 0.003886 -1.000000 1.000000 0.006823 0.036082 0.067793 -0.033880 -0.065197 -0.006121 0.009745 -0.005348 -0.028877 0.046076 -0.007804 -0.032752 NaN 0.041924 -0.041924 0.029453 NaN NaN 0.003885 -0.016547 NaN NaN 0.000508 NaN 0.017960 -0.039403 0.033252 NaN NaN -0.039147 NaN -0.013859 0.022868 NaN 0.012716 -0.040533 -0.038787 -0.002753 -0.026297 -0.032762 -0.026985 -0.034464
JobRole_Healthcare Representative 0.012878 0.008029 -0.015503 -0.066186 0.227637 -0.206425 -0.042785 0.029084 -0.108929 0.034165 0.017609 0.018681 -0.006823 0.006823 1.000000 -0.059898 -0.144652 -0.085409 -0.103472 -0.075038 -0.155727 -0.166971 -0.076515 0.027897 0.004913 -0.030126 NaN 0.000382 -0.000382 -0.078696 NaN NaN 0.024199 0.024270 NaN NaN 0.014090 NaN 0.001272 0.115704 0.016367 NaN NaN 0.026955 NaN -0.000928 -0.005090 NaN 0.014021 0.102770 -0.012432 -0.026101 0.080206 0.067537 0.075902 0.043270
JobRole_Human Resources -0.015890 0.001896 0.008962 0.904983 -0.263128 -0.126381 0.549751 -0.063119 -0.066690 -0.042895 0.001594 -0.008623 -0.036082 0.036082 -0.059898 1.000000 -0.088561 -0.052290 -0.063349 -0.045941 -0.095342 -0.102226 -0.046845 0.021541 0.030995 -0.052320 NaN 0.014026 -0.014026 0.036215 NaN NaN -0.026719 -0.005295 NaN NaN -0.022014 NaN -0.004952 -0.100922 -0.029681 NaN NaN 0.020578 NaN -0.010154 0.044169 NaN -0.009864 -0.032191 -0.035902 0.043887 -0.020259 -0.053260 -0.054603 -0.047546
JobRole_Laboratory Technician 0.009270 0.010023 -0.014815 -0.097859 0.336570 -0.305208 -0.063260 0.044359 -0.161055 0.066262 0.058759 -0.026589 -0.067793 0.067793 -0.144652 -0.088561 1.000000 -0.126280 -0.152987 -0.110947 -0.230248 -0.246873 -0.113130 -0.011224 -0.009233 0.019873 NaN 0.044774 -0.044774 0.098290 NaN NaN 0.014273 -0.063566 NaN NaN -0.001533 NaN -0.022724 -0.344608 -0.015710 NaN NaN -0.021121 NaN 0.010796 -0.010691 NaN 0.013386 -0.071452 0.053998 -0.028209 -0.125688 -0.119437 -0.110099 -0.117188
JobRole_Manager 0.014078 -0.042583 0.027294 0.087615 -0.071356 0.035248 0.082271 -0.011143 0.025577 -0.001128 -0.008046 -0.038946 0.033880 -0.033880 -0.085409 -0.052290 -0.126280 1.000000 -0.090330 -0.065508 -0.135949 -0.145765 -0.066797 0.001997 0.049982 -0.055176 NaN 0.011086 -0.011086 -0.083316 NaN NaN -0.066843 0.028453 NaN NaN 0.010730 NaN 0.017112 0.552744 -0.005620 NaN NaN 0.042125 NaN 0.032050 0.025638 NaN -0.015637 0.160364 0.003052 0.005137 0.076676 0.137535 0.224255 0.146055
JobRole_Manufacturing Director -0.013536 0.009783 0.000598 -0.070000 0.240754 -0.218320 -0.045251 0.052023 -0.115206 0.035496 -0.010820 0.007817 0.065197 -0.065197 -0.103472 -0.063349 -0.152987 -0.090330 1.000000 -0.079362 -0.164700 -0.176592 -0.080924 0.020543 0.002819 -0.021331 NaN 0.010302 -0.010302 -0.082994 NaN NaN -0.019004 -0.005290 NaN NaN 0.059178 NaN -0.021939 0.114896 -0.013747 NaN NaN 0.009580 NaN 0.029775 0.003640 NaN 0.007735 0.033861 -0.013987 0.002011 0.087510 0.075061 -0.007241 0.084649
JobRole_Research Director -0.021431 -0.023579 0.034600 -0.050765 0.174596 -0.158327 -0.032816 0.018401 -0.083548 0.062898 -0.006044 -0.022905 0.006121 -0.006121 -0.075038 -0.045941 -0.110947 -0.065508 -0.079362 1.000000 -0.119442 -0.128066 -0.058687 0.037524 0.008271 -0.042299 NaN -0.002400 0.002400 -0.088870 NaN NaN -0.003730 0.049694 NaN NaN -0.048689 NaN 0.015200 0.414319 -0.006217 NaN NaN 0.097925 NaN -0.035744 -0.005492 NaN 0.015807 0.140892 -0.004527 0.034403 0.078271 0.075569 0.074455 0.116442
JobRole_Research Scientist -0.010116 -0.004461 0.010588 -0.105352 0.362340 -0.328576 -0.068103 0.043729 -0.173387 0.039735 0.005286 0.076218 -0.009745 0.009745 -0.155727 -0.095342 -0.230248 -0.135949 -0.164700 -0.119442 1.000000 -0.265775 -0.121792 -0.012115 -0.039987 0.053522 NaN -0.054378 0.054378 -0.000360 NaN NaN -0.021396 0.000709 NaN NaN 0.001940 NaN 0.047604 -0.387788 0.020503 NaN NaN -0.043981 NaN 0.019416 -0.003116 NaN -0.011635 -0.169943 -0.052126 -0.058613 -0.115012 -0.123154 -0.105237 -0.124838
JobRole_Sales Executive 0.031022 -0.010175 -0.011920 -0.112959 -0.733497 0.808869 -0.073020 -0.091122 0.457308 -0.133532 -0.036995 -0.058843 0.005348 -0.005348 -0.166971 -0.102226 -0.246873 -0.145765 -0.176592 -0.128066 -0.265775 1.000000 -0.130586 -0.013853 0.005751 0.006210 NaN -0.006341 0.006341 0.019774 NaN NaN 0.055332 0.053398 NaN NaN -0.024421 NaN -0.011413 0.127490 0.012604 NaN NaN 0.005913 NaN -0.041401 -0.004836 NaN 0.015756 -0.050006 0.013241 0.032092 0.133917 0.109504 0.049202 0.099893
JobRole_Sales Representative -0.033780 0.055469 -0.025257 -0.051764 -0.336127 0.370667 -0.033462 -0.043208 0.133065 -0.051990 -0.033774 0.057185 0.028877 -0.028877 -0.076515 -0.046845 -0.113130 -0.066797 -0.080924 -0.058687 -0.121792 -0.130586 1.000000 -0.052890 -0.023659 0.072439 NaN -0.003347 0.003347 0.157234 NaN NaN 0.007154 -0.091465 NaN NaN 0.002949 NaN -0.027282 -0.216559 0.001413 NaN NaN -0.104494 NaN -0.006214 -0.024859 NaN -0.048067 0.041842 0.040377 0.045148 -0.191950 -0.150470 -0.085622 -0.170527
MaritalStatus_Divorced 0.057455 0.005779 -0.043287 0.016037 0.035158 -0.043451 0.012107 -0.002672 -0.007212 0.013316 0.005411 -0.019243 -0.046076 0.046076 0.027897 0.021541 -0.011224 0.001997 0.020543 0.037524 -0.012115 -0.013853 -0.052890 1.000000 -0.491506 -0.366691 NaN -0.023462 0.023462 -0.087716 NaN NaN 0.025673 -0.002439 NaN NaN 0.016439 NaN 0.016815 0.037087 -0.015197 NaN NaN 0.040824 NaN -0.010310 0.006199 NaN 0.446285 0.014843 0.008405 -0.009080 0.011309 0.023047 -0.005279 0.015815
MaritalStatus_Married -0.043635 -0.030785 0.055613 0.034767 -0.019997 0.005378 0.057339 -0.017866 0.018491 -0.007139 -0.009171 0.002710 0.007804 -0.007804 0.004913 0.030995 -0.009233 0.049982 0.002819 0.008271 -0.039987 0.005751 -0.023659 -0.491506 1.000000 -0.629981 NaN 0.013502 -0.013502 -0.090984 NaN NaN 0.002933 -0.001865 NaN NaN -0.022180 NaN 0.028324 0.050547 -0.010315 NaN NaN -0.016142 NaN 0.009585 -0.043382 NaN 0.225574 -0.005217 -0.029602 -0.006388 0.066529 0.055687 0.054102 0.036885
MaritalStatus_Single -0.004622 0.027734 -0.020808 -0.051443 -0.009990 0.033002 -0.072051 0.021469 -0.013323 -0.004249 0.004972 0.014265 0.032752 -0.032752 -0.030126 -0.052320 0.019873 -0.055176 -0.021331 -0.042299 0.053522 0.006210 0.072439 -0.366691 -0.629981 1.000000 NaN 0.006498 -0.006498 0.175419 NaN NaN -0.026027 0.004168 NaN NaN 0.009035 NaN -0.045253 -0.087072 0.024571 NaN NaN -0.019161 NaN -0.001045 0.040817 NaN -0.638957 -0.007663 0.024129 0.014921 -0.081157 -0.080043 -0.053090 -0.053507
Over18_Y NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
OverTime_No 0.037163 -0.029392 0.000539 0.006178 0.003036 -0.005864 -0.004040 0.013787 -0.014607 -0.002246 -0.024970 0.017723 -0.041924 0.041924 0.000382 0.014026 0.044774 0.011086 0.010302 -0.002400 -0.054378 -0.006341 -0.003347 -0.023462 0.013502 0.006498 NaN 1.000000 -1.000000 -0.246118 NaN NaN -0.038315 0.020322 NaN NaN -0.070132 NaN 0.003507 -0.000544 -0.024539 NaN NaN 0.020786 NaN -0.004369 -0.048493 NaN 0.000449 -0.003604 0.079113 0.027092 0.048276 0.010861 0.012239 0.033366
OverTime_Yes -0.037163 0.029392 -0.000539 -0.006178 -0.003036 0.005864 0.004040 -0.013787 0.014607 0.002246 0.024970 -0.017723 0.041924 -0.041924 -0.000382 -0.014026 -0.044774 -0.011086 -0.010302 0.002400 0.054378 0.006341 0.003347 0.023462 -0.013502 -0.006498 NaN -1.000000 1.000000 0.246118 NaN NaN 0.038315 -0.020322 NaN NaN 0.070132 NaN -0.003507 0.000544 0.024539 NaN NaN -0.020786 NaN 0.004369 0.048493 NaN -0.000449 0.003604 -0.079113 -0.027092 -0.048276 -0.010861 -0.012239 -0.033366
Attrition -0.074457 0.115143 -0.049538 0.016832 -0.085293 0.080855 0.036466 -0.032703 0.055781 -0.046999 -0.017898 0.069355 -0.029453 0.029453 -0.078696 0.036215 0.098290 -0.083316 -0.082994 -0.088870 -0.000360 0.019774 0.157234 -0.087716 -0.090984 0.175419 NaN -0.246118 0.246118 1.000000 NaN NaN 0.073971 -0.031373 NaN NaN -0.103369 NaN -0.130016 -0.169105 -0.103481 NaN NaN 0.043494 NaN 0.002889 -0.045872 NaN -0.137145 -0.033917 -0.059478 -0.063939 -0.182422 -0.164386 -0.033019 -0.150640
Age NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
DailyRate NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
DistanceFromHome 0.008097 -0.022222 0.013743 -0.019777 -0.026238 0.035867 -0.005234 -0.045394 0.054723 0.021864 0.027156 -0.036177 -0.003885 0.003885 0.024199 -0.026719 0.014273 -0.066843 -0.019004 -0.003730 -0.021396 0.055332 0.007154 0.025673 0.002933 -0.026027 NaN -0.038315 0.038315 0.073971 NaN NaN 1.000000 0.030019 NaN NaN -0.001822 NaN 0.048667 -0.014518 -0.028497 NaN NaN -0.011595 NaN -0.000746 0.005234 NaN 0.021909 -0.003448 -0.006813 -0.023779 0.014902 0.005106 -0.013109 0.002100
Education 0.004524 -0.008292 0.004126 0.011435 -0.018604 0.014215 0.026479 0.013184 0.072405 -0.072335 0.038043 -0.026742 0.016547 -0.016547 0.024270 -0.005295 -0.063566 0.028453 -0.005290 0.049694 0.000709 0.053398 -0.091465 -0.002439 -0.001865 0.004168 NaN 0.020322 -0.020322 -0.031373 NaN NaN 0.030019 1.000000 NaN NaN -0.027128 NaN 0.042438 0.101589 -0.011296 NaN NaN 0.126317 NaN -0.024539 -0.009118 NaN 0.018422 0.062663 -0.025100 0.009819 0.047600 0.056703 0.054254 0.055449
EmployeeCount NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
EmployeeNumber NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
EnvironmentSatisfaction 0.003568 -0.012624 0.008496 -0.007597 0.027976 -0.025606 -0.006898 -0.024526 0.000479 -0.021299 0.064602 0.027713 -0.000508 0.000508 0.014090 -0.022014 -0.001533 0.010730 0.059178 -0.048689 0.001940 -0.024421 0.002949 0.016439 -0.022180 0.009035 NaN -0.070132 0.070132 -0.103369 NaN NaN -0.001822 -0.027128 NaN NaN 1.000000 NaN -0.008278 0.001212 -0.006784 NaN NaN 0.012594 NaN -0.029548 0.007665 NaN 0.003432 0.024419 -0.019359 0.027627 0.020692 0.040470 0.016194 -0.007923
HourlyRate NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
JobInvolvement -0.045779 0.004424 0.026714 0.004789 0.023187 -0.026107 0.002079 0.003228 -0.018657 0.017103 -0.011895 -0.004519 -0.017960 0.017960 0.001272 -0.004952 -0.022724 0.017112 -0.021939 0.015200 0.047604 -0.011413 -0.027282 0.016815 0.028324 -0.045253 NaN 0.003507 -0.003507 -0.130016 NaN NaN 0.048667 0.042438 NaN NaN -0.008278 NaN 1.000000 -0.012630 -0.021476 NaN NaN 0.015012 NaN -0.029071 0.034297 NaN 0.021523 -0.017304 -0.015338 -0.014617 0.033586 0.004117 -0.024184 0.025310
JobLevel -0.007295 -0.021557 0.023433 -0.006157 -0.107830 0.114307 0.010409 -0.008431 0.092698 -0.014114 -0.016724 -0.054707 0.039403 -0.039403 0.115704 -0.100922 -0.344608 0.552744 0.114896 0.414319 -0.387788 0.127490 -0.216559 0.037087 0.050547 -0.087072 NaN -0.000544 0.000544 -0.169105 NaN NaN -0.014518 0.101589 NaN NaN 0.001212 NaN -0.012630 1.000000 -0.001944 NaN NaN 0.142501 NaN -0.021222 0.021642 NaN 0.013984 0.356467 -0.018191 0.037818 0.306263 0.332018 0.353885 0.361762
JobSatisfaction 0.019802 0.027117 -0.036562 -0.024068 -0.002798 0.013499 -0.021467 0.052004 -0.023528 -0.022645 0.003380 -0.019795 -0.033252 0.033252 0.016367 -0.029681 -0.015710 -0.005620 -0.013747 -0.006217 0.020503 0.012604 0.001413 -0.015197 -0.010315 0.024571 NaN -0.024539 0.024539 -0.103481 NaN NaN -0.028497 -0.011296 NaN NaN -0.006784 NaN -0.021476 -0.001944 1.000000 NaN NaN -0.055699 NaN 0.002297 -0.012454 NaN 0.010690 0.002016 -0.005779 -0.019459 0.018744 0.007938 -0.018214 -0.019899
MonthlyIncome NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
MonthlyRate NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
NumCompaniesWorked 0.002718 -0.039718 0.032401 0.020618 0.022237 -0.032097 0.031007 -0.006131 -0.018611 0.024826 -0.012870 -0.013819 0.039147 -0.039147 0.026955 0.020578 -0.021121 0.042125 0.009580 0.097925 -0.043981 0.005913 -0.104494 0.040824 -0.016142 -0.019161 NaN 0.020786 -0.020786 0.043494 NaN NaN -0.011595 0.126317 NaN NaN 0.012594 NaN 0.015012 0.142501 -0.055699 NaN NaN 1.000000 NaN -0.014095 0.052733 NaN 0.030075 0.042451 -0.066054 -0.008366 -0.142141 -0.111122 -0.036814 -0.120770
PercentSalaryHike NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
PerformanceRating 0.018310 0.016463 -0.026390 -0.006385 0.032720 -0.031050 -0.016167 0.010853 -0.020918 0.014868 0.011449 -0.021729 0.013859 -0.013859 -0.000928 -0.010154 0.010796 0.032050 0.029775 -0.035744 0.019416 -0.041401 -0.006214 -0.010310 0.009585 -0.001045 NaN -0.004369 0.004369 0.002889 NaN NaN -0.000746 -0.024539 NaN NaN -0.029548 NaN -0.029071 -0.021222 0.002297 NaN NaN -0.014095 NaN 1.000000 -0.031351 NaN 0.003506 0.011048 -0.015579 0.002572 0.029423 0.034031 0.017896 0.025381
RelationshipSatisfaction 0.021132 0.028500 -0.038640 0.034583 -0.004587 -0.010489 0.041105 -0.019973 -0.006580 0.030494 -0.020305 -0.011044 -0.022868 0.022868 -0.005090 0.044169 -0.010691 0.025638 0.003640 -0.005492 -0.003116 -0.004836 -0.024859 0.006199 -0.043382 0.040817 NaN -0.048493 0.048493 -0.045872 NaN NaN 0.005234 -0.009118 NaN NaN 0.007665 NaN 0.034297 0.021642 -0.012454 NaN NaN 0.052733 NaN -0.031351 1.000000 NaN -0.045952 -0.004342 0.002497 0.019604 -0.021786 -0.021509 0.033493 -0.005533
StandardHours NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
StockOptionLevel 0.028807 -0.016142 -0.005303 -0.004000 0.016927 -0.015755 0.021206 -0.017993 0.022560 0.033750 -0.042100 -0.024560 -0.012716 0.012716 0.014021 -0.009864 0.013386 -0.015637 0.007735 0.015807 -0.011635 0.015756 -0.048067 0.446285 0.225574 -0.638957 NaN 0.000449 -0.000449 -0.137145 NaN NaN 0.021909 0.018422 NaN NaN 0.003432 NaN 0.021523 0.013984 0.010690 NaN NaN 0.030075 NaN 0.003506 -0.045952 NaN 1.000000 0.016672 0.011274 0.004129 0.057714 0.062072 0.014352 0.037675
TotalWorkingYears 0.018476 0.006153 -0.017619 -0.007662 -0.004442 0.007973 0.004348 0.047855 0.029466 -0.050973 -0.039133 -0.002168 0.040533 -0.040533 0.102770 -0.032191 -0.071452 0.160364 0.033861 0.140892 -0.169943 -0.050006 0.041842 0.014843 -0.005217 -0.007663 NaN -0.003604 0.003604 -0.033917 NaN NaN -0.003448 0.062663 NaN NaN 0.024419 NaN -0.017304 0.356467 0.002016 NaN NaN 0.042451 NaN 0.011048 -0.004342 NaN 0.016672 1.000000 0.009694 0.003894 0.198983 0.228646 0.216931 0.244631
TrainingTimesLastYear -0.020746 0.006193 0.008498 -0.040022 -0.006819 0.024688 -0.037664 -0.039018 -0.029046 0.070542 -0.008151 0.008289 0.038787 -0.038787 -0.012432 -0.035902 0.053998 0.003052 -0.013987 -0.004527 -0.052126 0.013241 0.040377 0.008405 -0.029602 0.024129 NaN 0.079113 -0.079113 -0.059478 NaN NaN -0.006813 -0.025100 NaN NaN -0.019359 NaN -0.015338 -0.018191 -0.005779 NaN NaN -0.066054 NaN -0.015579 0.002497 NaN 0.011274 0.009694 1.000000 0.028072 0.000134 -0.001871 -0.002067 -0.003578
WorkLifeBalance 0.005780 0.010199 -0.012640 0.047763 -0.069922 0.051320 -0.003967 -0.039728 0.018500 0.001641 0.031812 0.021962 0.002753 -0.002753 -0.026101 0.043887 -0.028209 0.005137 0.002011 0.034403 -0.058613 0.032092 0.045148 -0.009080 -0.006388 0.014921 NaN 0.027092 -0.027092 -0.063939 NaN NaN -0.023779 0.009819 NaN NaN 0.027627 NaN -0.014617 0.037818 -0.019459 NaN NaN -0.008366 NaN 0.002572 0.019604 NaN 0.004129 0.003894 0.028072 1.000000 0.006952 0.036562 0.008941 -0.005749
YearsAtCompany 0.023331 0.021100 -0.033732 -0.005295 -0.036307 0.039907 -0.008196 -0.013111 0.016770 0.009381 -0.022235 0.010803 0.026297 -0.026297 0.080206 -0.020259 -0.125688 0.076676 0.087510 0.078271 -0.115012 0.133917 -0.191950 0.011309 0.066529 -0.081157 NaN 0.048276 -0.048276 -0.182422 NaN NaN 0.014902 0.047600 NaN NaN 0.020692 NaN 0.033586 0.306263 0.018744 NaN NaN -0.142141 NaN 0.029423 -0.021786 NaN 0.057714 0.198983 0.000134 0.006952 1.000000 0.771709 0.421157 0.745037
YearsInCurrentRole 0.014365 0.013334 -0.021064 -0.033121 -0.037461 0.053360 -0.020759 0.004035 0.037816 -0.020740 -0.006238 0.000445 0.032762 -0.032762 0.067537 -0.053260 -0.119437 0.137535 0.075061 0.075569 -0.123154 0.109504 -0.150470 0.023047 0.055687 -0.080043 NaN 0.010861 -0.010861 -0.164386 NaN NaN 0.005106 0.056703 NaN NaN 0.040470 NaN 0.004117 0.332018 0.007938 NaN NaN -0.111122 NaN 0.034031 -0.021509 NaN 0.062072 0.228646 -0.001871 0.036562 0.771709 1.000000 0.501135 0.705200
YearsSinceLastPromotion 0.020815 0.023216 -0.033877 -0.026931 -0.021497 0.034112 -0.023700 -0.002480 0.006219 0.022665 -0.039931 0.003853 0.026985 -0.026985 0.075902 -0.054603 -0.110099 0.224255 -0.007241 0.074455 -0.105237 0.049202 -0.085622 -0.005279 0.054102 -0.053090 NaN 0.012239 -0.012239 -0.033019 NaN NaN -0.013109 0.054254 NaN NaN 0.016194 NaN -0.024184 0.353885 -0.018214 NaN NaN -0.036814 NaN 0.017896 0.033493 NaN 0.014352 0.216931 -0.002067 0.008941 0.421157 0.501135 1.000000 0.479890
YearsWithCurrManager 0.029183 0.029774 -0.045106 -0.021503 -0.024626 0.034959 -0.025943 0.003488 0.022774 -0.005065 -0.015490 0.002107 0.034464 -0.034464 0.043270 -0.047546 -0.117188 0.146055 0.084649 0.116442 -0.124838 0.099893 -0.170527 0.015815 0.036885 -0.053507 NaN 0.033366 -0.033366 -0.150640 NaN NaN 0.002100 0.055449 NaN NaN -0.007923 NaN 0.025310 0.361762 -0.019899 NaN NaN -0.120770 NaN 0.025381 -0.005533 NaN 0.037675 0.244631 -0.003578 -0.005749 0.745037 0.705200 0.479890 1.000000
In [ ]:
df_train_merge4.describe()
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
count 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.000000 1470.0 1470.000000 1470.000000 1.470000e+03 1.470000e+03 1470.0 1.470000e+03 1.470000e+03 1470.0 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1470.0 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1470.0 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03 1.470000e+03
mean 0.102041 0.188435 0.709524 0.042857 0.653741 0.303401 0.018367 0.412245 0.108163 0.315646 0.055782 0.089796 0.400000 0.600000 0.089116 0.035374 0.176190 0.069388 0.098639 0.054422 0.198639 0.221769 0.056463 0.222449 0.457823 0.319728 1.0 0.717007 0.282993 -7.106938e-17 7.105427e-15 0.0 -1.970457e-16 2.697011e-16 0.0 4.547474e-13 7.054070e-17 -2.842171e-14 -6.495182e-18 1.479542e-16 -7.816876e-18 0.0 1.818989e-12 -5.588878e-17 3.552714e-15 -7.766274e-16 -1.910792e-16 0.0 6.600918e-17 1.253721e-17 -1.106069e-16 -5.165936e-17 -1.540340e-16 -8.851574e-17 -4.236974e-17 -6.993650e-17
std 0.302805 0.391193 0.454137 0.202604 0.475939 0.459884 0.134321 0.492406 0.310692 0.464931 0.229579 0.285986 0.490065 0.490065 0.285008 0.184786 0.381112 0.254199 0.298279 0.226925 0.399112 0.415578 0.230891 0.416033 0.498387 0.466530 0.0 0.450606 0.450606 1.000340e+00 0.000000e+00 0.0 1.000340e+00 1.000340e+00 0.0 0.000000e+00 1.000340e+00 0.000000e+00 1.000340e+00 1.000340e+00 1.000340e+00 0.0 0.000000e+00 1.000340e+00 0.000000e+00 1.000340e+00 1.000340e+00 0.0 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00 1.000340e+00
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.0 0.000000 0.000000 -4.384223e-01 7.105427e-15 0.0 -1.412470e+00 -1.868426e+00 0.0 4.547474e-13 -1.575686e+00 -2.842171e-14 -2.432006e+00 -9.614864e-01 -1.567907e+00 0.0 1.818989e-12 -1.078504e+00 3.552714e-15 -4.262300e-01 -1.584178e+00 0.0 -9.320144e-01 -2.270509e+00 -2.171982e+00 -2.493820e+00 -1.700445e+00 -1.219103e+00 -6.791457e-01 -1.191138e+00
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.0 0.000000 0.000000 -4.384223e-01 7.105427e-15 0.0 -1.137945e+00 -8.916883e-01 0.0 4.547474e-13 -6.605307e-01 -2.842171e-14 -1.026167e+00 -9.614864e-01 -6.608532e-01 0.0 1.818989e-12 -6.780494e-01 3.552714e-15 -4.262300e-01 -6.589728e-01 0.0 -9.320144e-01 -9.052321e-01 -6.201892e-01 -1.077862e+00 -7.581791e-01 -6.187220e-01 -6.791457e-01 -5.968549e-01
50% 0.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.0 1.000000 0.000000 -4.384223e-01 7.105427e-15 0.0 2.346832e-01 8.504925e-02 0.0 4.547474e-13 2.546249e-01 -2.842171e-14 3.796721e-01 -5.778755e-02 2.462002e-01 0.0 1.818989e-12 -2.775943e-01 3.552714e-15 -4.262300e-01 2.662326e-01 0.0 2.419883e-01 5.872853e-01 1.557071e-01 3.380962e-01 -1.300020e-01 -3.185315e-01 -3.687153e-01 -2.997134e-01
75% 0.000000 0.000000 1.000000 0.000000 1.000000 1.000000 0.000000 1.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.000000 1.0 1.000000 1.000000 -4.384223e-01 7.105427e-15 0.0 8.365853e-01 1.061787e+00 0.0 4.547474e-13 1.169781e+00 -2.842171e-14 3.796721e-01 8.459113e-01 1.153254e+00 0.0 1.818989e-12 5.233157e-01 3.552714e-15 -4.262300e-01 1.191438e+00 0.0 2.419883e-01 5.872853e-01 1.557071e-01 3.380962e-01 5.007390e-01 8.822302e-01 2.521455e-01 8.888524e-01
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.0 1.000000 1.000000 2.280906e+00 7.105427e-15 0.0 2.156362e+00 2.038524e+00 0.0 4.547474e-13 1.169781e+00 -2.842171e-14 1.785511e+00 2.653309e+00 1.153254e+00 0.0 1.818989e-12 2.525591e+00 3.552714e-15 2.346151e+00 1.191438e+00 0.0 2.589994e+00 5.872853e-01 2.483396e+00 1.754054e+00 2.382706e+00 2.983563e+00 3.977310e+00 2.968843e+00

5. Problem Statements¶

In [ ]:
# 5. Visualize all the distribution relationship
In [ ]:
# 1. Seperate categorical columns
In [ ]:
cat = []
con = []
for i in df_attr.columns:
    if (df_attr[i].dtypes == 'object'):
        cat.append(i)
    else:
        con.append(i)
In [ ]:
df_train5_cat1 = cat
df_train5_cat1 = df_attr[cat]
df_train5_cat1 
Out[ ]:
BusinessTravel Department EducationField Gender JobRole MaritalStatus Over18 OverTime AGE_GROUP
0 Travel_Rarely Sales Life Sciences Female Sales Executive Single Y Yes GROUP2
1 Travel_Frequently Research & Development Life Sciences Male Research Scientist Married Y No GROUP3
2 Travel_Rarely Research & Development Other Male Laboratory Technician Single Y Yes GROUP2
3 Travel_Frequently Research & Development Life Sciences Female Research Scientist Married Y Yes GROUP2
4 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No GROUP1
... ... ... ... ... ... ... ... ... ...
1465 Travel_Frequently Research & Development Medical Male Laboratory Technician Married Y No GROUP2
1466 Travel_Rarely Research & Development Medical Male Healthcare Representative Married Y No GROUP2
1467 Travel_Rarely Research & Development Life Sciences Male Manufacturing Director Married Y Yes GROUP1
1468 Travel_Frequently Sales Medical Male Sales Executive Married Y No GROUP3
1469 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No GROUP2

1470 rows × 9 columns

In [ ]:
df_train5_con1 = con
df_train5_con1 = df_attr[con]
df_train5_con1
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 1102 1 2 1 1 2 94 3 2 4 5993 19479 8 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 279 8 1 1 2 3 61 2 2 2 5130 24907 1 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 1373 2 2 1 4 4 92 2 1 3 2090 2396 6 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 1392 3 4 1 5 4 56 3 1 3 2909 23159 1 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 591 2 1 1 7 1 40 3 1 2 3468 16632 9 12 3 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 36 884 23 2 1 2061 3 41 4 2 4 2571 12290 4 17 3 3 80 1 17 3 3 5 2 0 3
1466 0 39 613 6 1 1 2062 4 42 2 3 1 9991 21457 4 15 3 1 80 1 9 5 3 7 7 1 7
1467 0 27 155 4 3 1 2064 2 87 4 2 2 6142 5174 1 20 4 2 80 1 6 0 3 6 2 0 3
1468 0 49 1023 2 3 1 2065 4 63 2 2 2 5390 13243 2 14 3 4 80 0 17 3 2 9 6 0 8
1469 0 34 628 8 3 1 2068 2 82 4 2 3 4404 10228 2 12 3 1 80 0 6 3 4 4 3 1 2

1470 rows × 27 columns

In [ ]:
# Visulasation of categorical columns
In [ ]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train5_cat1.columns):
    if df_train5_cat1[i].dtypes=='object':
        plt.subplot(3,3,x1+1)
        sns.countplot(x=df_train5_cat1[i])
In [ ]:
# Visulisation of continious columns
In [ ]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train5_con1.columns):
    if df_train5_con1[i].dtypes == 'int64' or df_train5_con1[i].dypes == 'float64':
        plt.subplot(9,3,x1+1)
        sns.distplot(df_train5_con1[i])

Checking Skewness¶

In [ ]:
df_attr.skew().sort_values()
Out[ ]:
WorkLifeBalance            -0.552480
JobInvolvement             -0.498419
JobSatisfaction            -0.329672
EnvironmentSatisfaction    -0.321654
RelationshipSatisfaction   -0.302828
Education                  -0.289681
HourlyRate                 -0.032311
DailyRate                  -0.003519
EmployeeCount               0.000000
StandardHours               0.000000
EmployeeNumber              0.016574
MonthlyRate                 0.018578
Age                         0.413286
TrainingTimesLastYear       0.553124
PercentSalaryHike           0.821128
YearsWithCurrManager        0.833451
YearsInCurrentRole          0.917363
DistanceFromHome            0.958118
StockOptionLevel            0.968980
JobLevel                    1.025401
NumCompaniesWorked          1.026471
TotalWorkingYears           1.117172
MonthlyIncome               1.369817
YearsAtCompany              1.764529
Attrition                   1.844366
PerformanceRating           1.921883
YearsSinceLastPromotion     1.984290
dtype: float64

Checking Correlation¶

In [ ]:
df_attr.corr()
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome MonthlyRate NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
Attrition 1.000000 -0.159205 -0.056652 0.077924 -0.031373 NaN -0.010577 -0.103369 -0.006846 -0.130016 -0.169105 -0.103481 -0.159840 0.015170 0.043494 -0.013478 0.002889 -0.045872 NaN -0.137145 -0.171063 -0.059478 -0.063939 -0.134392 -0.160545 -0.033019 -0.156199
Age -0.159205 1.000000 0.010661 -0.001686 0.208034 NaN -0.010145 0.010146 0.024287 0.029820 0.509604 -0.004892 0.497855 0.028051 0.299635 0.003634 0.001904 0.053535 NaN 0.037510 0.680381 -0.019621 -0.021490 0.311309 0.212901 0.216513 0.202089
DailyRate -0.056652 0.010661 1.000000 -0.004985 -0.016806 NaN -0.050990 0.018355 0.023381 0.046135 0.002966 0.030571 0.007707 -0.032182 0.038153 0.022704 0.000473 0.007846 NaN 0.042143 0.014515 0.002453 -0.037848 -0.034055 0.009932 -0.033229 -0.026363
DistanceFromHome 0.077924 -0.001686 -0.004985 1.000000 0.021042 NaN 0.032916 -0.016075 0.031131 0.008783 0.005303 -0.003669 -0.017014 0.027473 -0.029251 0.040235 0.027110 0.006557 NaN 0.044872 0.004628 -0.036942 -0.026556 0.009508 0.018845 0.010029 0.014406
Education -0.031373 0.208034 -0.016806 0.021042 1.000000 NaN 0.042070 -0.027128 0.016775 0.042438 0.101589 -0.011296 0.094961 -0.026084 0.126317 -0.011111 -0.024539 -0.009118 NaN 0.018422 0.148280 -0.025100 0.009819 0.069114 0.060236 0.054254 0.069065
EmployeeCount NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
EmployeeNumber -0.010577 -0.010145 -0.050990 0.032916 0.042070 NaN 1.000000 0.017621 0.035179 -0.006888 -0.018519 -0.046247 -0.014829 0.012648 -0.001251 -0.012944 -0.020359 -0.069861 NaN 0.062227 -0.014365 0.023603 0.010309 -0.011240 -0.008416 -0.009019 -0.009197
EnvironmentSatisfaction -0.103369 0.010146 0.018355 -0.016075 -0.027128 NaN 0.017621 1.000000 -0.049857 -0.008278 0.001212 -0.006784 -0.006259 0.037600 0.012594 -0.031701 -0.029548 0.007665 NaN 0.003432 -0.002693 -0.019359 0.027627 0.001458 0.018007 0.016194 -0.004999
HourlyRate -0.006846 0.024287 0.023381 0.031131 0.016775 NaN 0.035179 -0.049857 1.000000 0.042861 -0.027853 -0.071335 -0.015794 -0.015297 0.022157 -0.009062 -0.002172 0.001330 NaN 0.050263 -0.002334 -0.008548 -0.004607 -0.019582 -0.024106 -0.026716 -0.020123
JobInvolvement -0.130016 0.029820 0.046135 0.008783 0.042438 NaN -0.006888 -0.008278 0.042861 1.000000 -0.012630 -0.021476 -0.015271 -0.016322 0.015012 -0.017205 -0.029071 0.034297 NaN 0.021523 -0.005533 -0.015338 -0.014617 -0.021355 0.008717 -0.024184 0.025976
JobLevel -0.169105 0.509604 0.002966 0.005303 0.101589 NaN -0.018519 0.001212 -0.027853 -0.012630 1.000000 -0.001944 0.950300 0.039563 0.142501 -0.034730 -0.021222 0.021642 NaN 0.013984 0.782208 -0.018191 0.037818 0.534739 0.389447 0.353885 0.375281
JobSatisfaction -0.103481 -0.004892 0.030571 -0.003669 -0.011296 NaN -0.046247 -0.006784 -0.071335 -0.021476 -0.001944 1.000000 -0.007157 0.000644 -0.055699 0.020002 0.002297 -0.012454 NaN 0.010690 -0.020185 -0.005779 -0.019459 -0.003803 -0.002305 -0.018214 -0.027656
MonthlyIncome -0.159840 0.497855 0.007707 -0.017014 0.094961 NaN -0.014829 -0.006259 -0.015794 -0.015271 0.950300 -0.007157 1.000000 0.034814 0.149515 -0.027269 -0.017120 0.025873 NaN 0.005408 0.772893 -0.021736 0.030683 0.514285 0.363818 0.344978 0.344079
MonthlyRate 0.015170 0.028051 -0.032182 0.027473 -0.026084 NaN 0.012648 0.037600 -0.015297 -0.016322 0.039563 0.000644 0.034814 1.000000 0.017521 -0.006429 -0.009811 -0.004085 NaN -0.034323 0.026442 0.001467 0.007963 -0.023655 -0.012815 0.001567 -0.036746
NumCompaniesWorked 0.043494 0.299635 0.038153 -0.029251 0.126317 NaN -0.001251 0.012594 0.022157 0.015012 0.142501 -0.055699 0.149515 0.017521 1.000000 -0.010238 -0.014095 0.052733 NaN 0.030075 0.237639 -0.066054 -0.008366 -0.118421 -0.090754 -0.036814 -0.110319
PercentSalaryHike -0.013478 0.003634 0.022704 0.040235 -0.011111 NaN -0.012944 -0.031701 -0.009062 -0.017205 -0.034730 0.020002 -0.027269 -0.006429 -0.010238 1.000000 0.773550 -0.040490 NaN 0.007528 -0.020608 -0.005221 -0.003280 -0.035991 -0.001520 -0.022154 -0.011985
PerformanceRating 0.002889 0.001904 0.000473 0.027110 -0.024539 NaN -0.020359 -0.029548 -0.002172 -0.029071 -0.021222 0.002297 -0.017120 -0.009811 -0.014095 0.773550 1.000000 -0.031351 NaN 0.003506 0.006744 -0.015579 0.002572 0.003435 0.034986 0.017896 0.022827
RelationshipSatisfaction -0.045872 0.053535 0.007846 0.006557 -0.009118 NaN -0.069861 0.007665 0.001330 0.034297 0.021642 -0.012454 0.025873 -0.004085 0.052733 -0.040490 -0.031351 1.000000 NaN -0.045952 0.024054 0.002497 0.019604 0.019367 -0.015123 0.033493 -0.000867
StandardHours NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
StockOptionLevel -0.137145 0.037510 0.042143 0.044872 0.018422 NaN 0.062227 0.003432 0.050263 0.021523 0.013984 0.010690 0.005408 -0.034323 0.030075 0.007528 0.003506 -0.045952 NaN 1.000000 0.010136 0.011274 0.004129 0.015058 0.050818 0.014352 0.024698
TotalWorkingYears -0.171063 0.680381 0.014515 0.004628 0.148280 NaN -0.014365 -0.002693 -0.002334 -0.005533 0.782208 -0.020185 0.772893 0.026442 0.237639 -0.020608 0.006744 0.024054 NaN 0.010136 1.000000 -0.035662 0.001008 0.628133 0.460365 0.404858 0.459188
TrainingTimesLastYear -0.059478 -0.019621 0.002453 -0.036942 -0.025100 NaN 0.023603 -0.019359 -0.008548 -0.015338 -0.018191 -0.005779 -0.021736 0.001467 -0.066054 -0.005221 -0.015579 0.002497 NaN 0.011274 -0.035662 1.000000 0.028072 0.003569 -0.005738 -0.002067 -0.004096
WorkLifeBalance -0.063939 -0.021490 -0.037848 -0.026556 0.009819 NaN 0.010309 0.027627 -0.004607 -0.014617 0.037818 -0.019459 0.030683 0.007963 -0.008366 -0.003280 0.002572 0.019604 NaN 0.004129 0.001008 0.028072 1.000000 0.012089 0.049856 0.008941 0.002759
YearsAtCompany -0.134392 0.311309 -0.034055 0.009508 0.069114 NaN -0.011240 0.001458 -0.019582 -0.021355 0.534739 -0.003803 0.514285 -0.023655 -0.118421 -0.035991 0.003435 0.019367 NaN 0.015058 0.628133 0.003569 0.012089 1.000000 0.758754 0.618409 0.769212
YearsInCurrentRole -0.160545 0.212901 0.009932 0.018845 0.060236 NaN -0.008416 0.018007 -0.024106 0.008717 0.389447 -0.002305 0.363818 -0.012815 -0.090754 -0.001520 0.034986 -0.015123 NaN 0.050818 0.460365 -0.005738 0.049856 0.758754 1.000000 0.548056 0.714365
YearsSinceLastPromotion -0.033019 0.216513 -0.033229 0.010029 0.054254 NaN -0.009019 0.016194 -0.026716 -0.024184 0.353885 -0.018214 0.344978 0.001567 -0.036814 -0.022154 0.017896 0.033493 NaN 0.014352 0.404858 -0.002067 0.008941 0.618409 0.548056 1.000000 0.510224
YearsWithCurrManager -0.156199 0.202089 -0.026363 0.014406 0.069065 NaN -0.009197 -0.004999 -0.020123 0.025976 0.375281 -0.027656 0.344079 -0.036746 -0.110319 -0.011985 0.022827 -0.000867 NaN 0.024698 0.459188 -0.004096 0.002759 0.769212 0.714365 0.510224 1.000000

6. Problem Statements¶

In [ ]:
# Perform Test of Hypothesis :- Compare rates for same level male & female, check relationship between 
# categorical variable like Age & Gender / Gender & Education field, Age & Income etc
In [ ]:
df_train6 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
In [ ]:
df_train6.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [ ]:
df_train6.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Attrition                 1470 non-null   int64 
 1   Age                       1470 non-null   int64 
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(27), object(8)
memory usage: 402.1+ KB
In [ ]:
# 1] Comparing BusinessTravel vs Gender
In [ ]:
BG = pd.crosstab(df_train5_cat1.BusinessTravel,df_train6.Gender)
BG
Out[ ]:
Gender Female Male
BusinessTravel
Non-Travel 49 101
Travel_Frequently 117 160
Travel_Rarely 422 621
In [ ]:
from scipy.stats import chi2_contingency
In [ ]:
chi_sqr, p_value, DOF, EXP = chi2_contingency(BG)
In [ ]:
chi_sqr
Out[ ]:
4.031372310350092
In [ ]:
p_value
Out[ ]:
0.13322895625828154
In [ ]:
DOF
Out[ ]:
2
In [ ]:
EXP
Out[ ]:
array([[ 60. ,  90. ],
       [110.8, 166.2],
       [417.2, 625.8]])
In [ ]:
sns.countplot(x=df_train6.BusinessTravel,hue=df_train6.Gender)
Out[ ]:
<AxesSubplot: xlabel='BusinessTravel', ylabel='count'>
In [ ]:
# 2] comparing Gender vs Monthlyrate
In [ ]:
GM = pd.crosstab(df_train6.Gender,df_train6.MonthlyRate)
GM
Out[ ]:
MonthlyRate 2094 2097 2104 2112 2122 2125 2137 2227 2243 2253 2261 2288 2302 2323 2326 2338 2354 2373 2396 2437 2447 2493 2539 2560 2561 2613 2671 2689 2690 2706 2721 2725 2739 2755 2819 2823 2845 2851 2890 2900 2912 2939 2967 2975 2993 2997 3010 3020 3031 3032 3064 3072 3088 3119 3129 3140 3142 3156 3157 3164 3173 3193 3208 3297 3300 3334 3335 3339 3356 3372 3376 3395 3415 3423 3425 3427 3445 3449 3458 3465 3487 3498 3525 3536 3549 3567 3622 3666 3687 3692 3698 3708 3735 3787 3809 3810 3811 3835 3840 3854 ... 24852 24907 24920 24941 24978 25043 25063 25098 25103 25150 25166 25174 25178 25198 25233 25258 25265 25275 25291 25308 25326 25348 25353 25388 25412 25422 25440 25470 25479 25518 25527 25549 25592 25594 25605 25657 25681 25713 25725 25751 25755 25761 25796 25800 25811 25812 25846 25927 25949 25952 25995 26009 26062 26075 26076 26085 26092 26124 26176 26186 26204 26227 26236 26250 26278 26283 26285 26308 26312 26314 26342 26362 26376 26427 26458 26493 26496 26507 26537 26542 26551 26582 26589 26619 26703 26707 26767 26820 26841 26849 26862 26894 26897 26914 26933 26956 26959 26968 26997 26999
Gender
Female 1 0 0 1 0 0 1 1 0 0 1 0 1 1 0 1 1 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 1 1 0 1 1 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 1 ... 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 1 0 1 1 1 1 0 0 0 0 0 1 0 1 1 0 1 1 1 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 1 0 0 1 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 1 1 0 0 1 1 0
Male 0 1 1 0 1 2 0 0 1 1 0 1 0 0 1 0 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 0 2 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 0 0 1 1 1 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 1 1 1 1 1 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 0 ... 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 1 0 0 0 0 1 1 1 1 1 0 1 0 0 1 0 0 0 1 1 1 0 0 1 1 0 0 1 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 1 0 0 1 1 0 0 1

2 rows × 1427 columns

In [ ]:
chi_sqr, p_value, DOF, EXP = chi2_contingency(GM)
In [ ]:
chi_sqr
Out[ ]:
1421.388888888889
In [ ]:
p_value
Out[ ]:
0.5294748183389015
In [ ]:
DOF
Out[ ]:
1426
In [ ]:
# 3] comparing Gender vs Department
In [ ]:
GD = pd.crosstab(df_train6.Gender,df_train6.Department)
GD
Out[ ]:
Department Human Resources Research & Development Sales
Gender
Female 20 379 189
Male 43 582 257
In [ ]:
plt.figure(figsize=(10,9))
sns.countplot(x=df_train6.Gender,hue=df_train6.Department)
Out[ ]:
<AxesSubplot: xlabel='Gender', ylabel='count'>
In [ ]:
# 4] Comparing Gender vs Age
In [ ]:
GA = pd.crosstab(df_train6.Gender,df_train6.Age)
GA
Out[ ]:
Age 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
Gender
Female 4 4 6 6 5 1 11 6 14 22 12 29 26 28 23 24 33 27 29 18 27 16 24 15 16 14 10 17 15 14 7 12 9 7 7 10 15 4 7 0 6 6 2
Male 4 5 5 7 11 13 15 20 25 26 36 39 34 41 38 34 44 51 40 32 31 26 33 25 30 18 23 24 18 10 12 12 21 12 11 9 3 18 7 4 8 4 3
In [ ]:
chi_sqr, p_value, DOF, EXP = chi2_contingency(GA)
In [ ]:
chi_sqr
Out[ ]:
53.537114950915246
In [ ]:
p_value
Out[ ]:
0.10927801670328505
In [ ]:
# 5] comparing Overtime and Age
In [ ]:
OA = pd.crosstab(df_train6.OverTime,df_train6.Age)
OA
Out[ ]:
Age 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
OverTime
No 6 5 7 10 9 11 18 18 27 36 36 49 48 49 50 40 60 52 53 34 40 33 42 27 31 24 18 31 25 14 17 18 17 15 13 14 15 12 11 3 5 7 4
Yes 2 4 4 3 7 3 8 8 12 12 12 19 12 20 11 18 17 26 16 16 18 9 15 13 15 8 15 10 8 10 2 6 13 4 5 5 3 10 3 1 9 3 1
In [ ]:
chi_sqr, p_value, DOF, EXP = chi2_contingency(OA)
In [ ]:
chi_sqr
Out[ ]:
44.366828521428644
In [ ]:
p_value
Out[ ]:
0.37220025607537444
In [ ]:
# 6] Compairing Maritalstatus and Gender
In [ ]:
MG=pd.crosstab(df_train6.MaritalStatus,df_train6.Gender)
MG
Out[ ]:
Gender Female Male
MaritalStatus
Divorced 117 210
Married 272 401
Single 199 271
In [ ]:
chi_sqr, p_value, DOF, EXP = chi2_contingency(MG)
In [ ]:
chi_sqr
Out[ ]:
3.5478394206821307
In [ ]:
p_value
Out[ ]:
0.1696666396487212
In [ ]:
plt.figure(figsize=(10,9))
sns.countplot(x=df_train6.MaritalStatus, hue = df_train6.Gender)
Out[ ]:
<AxesSubplot: xlabel='MaritalStatus', ylabel='count'>
In [ ]:
# Compare age and Monthly income
In [ ]:
AM = pd.crosstab(df_train6.Age,df_train6.MonthlyIncome)
AM.head()
Out[ ]:
MonthlyIncome 1009 1051 1052 1081 1091 1102 1118 1129 1200 1223 1232 1261 1274 1281 1359 1393 1416 1420 1483 1514 1555 1563 1569 1601 1611 1675 1702 1706 1790 1859 1878 1904 1951 2001 2007 2008 2011 2013 2014 2018 2022 2024 2028 2029 2033 2042 2044 2045 2058 2061 2062 2064 2066 2070 2073 2074 2075 2080 2083 2086 2088 2089 2090 2093 2096 2097 2099 2105 2107 2109 2115 2119 2121 2127 2132 2133 2141 2143 2144 2145 2148 2153 2154 2157 2166 2168 2174 2176 2177 2180 2187 2194 2201 2206 2207 2210 2213 2216 2218 2220 ... 16856 16872 16880 16885 16959 17007 17046 17048 17068 17099 17123 17159 17169 17174 17181 17328 17399 17426 17444 17465 17567 17584 17603 17639 17650 17665 17779 17856 17861 17875 17924 18041 18061 18172 18200 18213 18265 18300 18303 18430 18606 18665 18711 18722 18740 18789 18824 18844 18880 18947 19033 19038 19045 19049 19068 19081 19094 19141 19144 19161 19187 19189 19190 19197 19202 19232 19237 19246 19272 19328 19331 19392 19406 19419 19431 19436 19502 19513 19517 19537 19545 19566 19586 19613 19626 19627 19636 19658 19665 19701 19717 19740 19833 19845 19847 19859 19926 19943 19973 19999
Age
18 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
20 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
21 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

5 rows × 1349 columns

In [ ]:
chi_sqr, p_value, DOF, EXP = chi2_contingency(AM)
In [ ]:
chi_sqr
Out[ ]:
57003.01607044047
In [ ]:
p_value
Out[ ]:
0.12517635416744993
In [ ]:
# Comparing Educationfield and Gender
In [ ]:
EG = pd.crosstab(df_train6.EducationField,df_train6.Gender)
EG.head()
Out[ ]:
Gender Female Male
EducationField
Human Resources 8 19
Life Sciences 240 366
Marketing 69 90
Medical 190 274
Other 29 53
In [ ]:
chi_sqr,p_value,DOF,EXP = chi2_contingency(EG)
In [ ]:
chi_sqr
Out[ ]:
2.9414238793151797
In [ ]:
p_value
Out[ ]:
0.7090162522843911
In [ ]:
plt.figure(figsize=(10,9))
sns.countplot(x=df_train6.EducationField,hue=df_train6.Gender)
Out[ ]:
<AxesSubplot: xlabel='EducationField', ylabel='count'>
In [ ]:
# Comparing Hourly rate and Age
In [ ]:
ht = pd.crosstab(df_train6.HourlyRate,df_train6.Gender)
ht
Out[ ]:
Gender Female Male
HourlyRate
30 11 8
31 8 7
32 8 16
33 12 7
34 5 7
... ... ...
96 12 15
97 7 14
98 13 15
99 7 13
100 7 12

71 rows × 2 columns

In [ ]:
chi_sqr,p_value,DOF,EXP = chi2_contingency(ht)
In [ ]:
chi_sqr
Out[ ]:
66.92505276398174
In [ ]:
p_value
Out[ ]:
0.5820716429499165

7.1 perform Regression treating Monthly Rate as Y, and choose predictions error and the best model¶

In [ ]:
# Use 1] SLR, 2] MLR, 3] Polynomial Regression, 4] Interaction term, 5] Ridge Regression, 6] Lasso etc

7.1] SLR¶

In [ ]:
DF_S = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
DF_S.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [ ]:
# Splitting the datasets into training and testing datasets
In [ ]:
x = DF_S[['DailyRate']]
y = DF_S[['MonthlyRate']]
In [ ]:
x
Out[ ]:
DailyRate
0 1102
1 279
2 1373
3 1392
4 591
... ...
1465 884
1466 613
1467 155
1468 1023
1469 628

1470 rows × 1 columns

In [ ]:
y
Out[ ]:
MonthlyRate
0 19479
1 24907
2 2396
3 23159
4 16632
... ...
1465 12290
1466 21457
1467 5174
1468 13243
1469 10228

1470 rows × 1 columns

In [ ]:
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.20,random_state=31)
In [ ]:
xtrain
Out[ ]:
DailyRate
815 984
1187 1189
1321 207
728 1441
387 759
... ...
826 433
610 269
894 685
16 334
722 1391

1176 rows × 1 columns

In [ ]:
xtest
Out[ ]:
DailyRate
1343 592
334 549
1136 329
1080 228
396 1473
... ...
91 632
520 817
1403 119
479 1287
1135 563

294 rows × 1 columns

In [ ]:
# Model Building

lin_model = LinearRegression()
lin_model.fit(xtrain,ytrain)
Out[ ]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [ ]:
# Training data evaluation
ypredtrain = lin_model.predict(xtrain)

MAE = mean_absolute_error(ytrain,ypredtrain)
print('Mean absolute error:',MAE)

MSE = mean_squared_error(ytrain,ypredtrain)
print('Mean squared error:',MSE)

RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)

RSquared = r2_score(ytrain,ypredtrain)
print('R-Squared:',RSquared)

AdjRsquared = 1-((1-RSquared)*(len(xtrain)-1)/(len(xtrain)-len(x.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 6270.772185623379
Mean squared error: 51623223.78228901
Root mean squared error: 7184.930325499964
R-Squared: 0.0010404518598107204
AdjRsquared: 0.0001895493486180344
In [ ]:
# Testing data evaluation
ypredtest = lin_model.predict(xtest)

MAE = mean_absolute_error(ytest,ypredtest)
print('Mean absolute error:',MAE)

MSE = mean_squared_error(ytest,ypredtest)
print('Mean squared error:',MSE)

RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)

RSquared = r2_score(ytest,ypredtest)
print('R-Squared:',RSquared)

AdjRsquared = 1-((1-RSquared)*(len(xtest)-1)/(len(xtest)-len(x.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 5849.367272215098
Mean squared error: 46390905.9967264
Root mean squared error: 6811.08699083534
R-Squared: 0.0007109055817979826
AdjRsquared: -0.0027113173442918637

7.2 MLR¶

In [ ]:
# Read the data

df_7 = pd.read_csv(r'G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_7.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [ ]:
y = df_7[['MonthlyRate']]
In [ ]:
df_7tr = df_7.drop('MonthlyRate',axis=1)
In [ ]:
df_7tr
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 36 Travel_Frequently 884 Research & Development 23 2 Medical 1 2061 3 Male 41 4 2 Laboratory Technician 4 Married 2571 4 Y No 17 3 3 80 1 17 3 3 5 2 0 3
1466 0 39 Travel_Rarely 613 Research & Development 6 1 Medical 1 2062 4 Male 42 2 3 Healthcare Representative 1 Married 9991 4 Y No 15 3 1 80 1 9 5 3 7 7 1 7
1467 0 27 Travel_Rarely 155 Research & Development 4 3 Life Sciences 1 2064 2 Male 87 4 2 Manufacturing Director 2 Married 6142 1 Y Yes 20 4 2 80 1 6 0 3 6 2 0 3
1468 0 49 Travel_Frequently 1023 Sales 2 3 Medical 1 2065 4 Male 63 2 2 Sales Executive 2 Married 5390 2 Y No 14 3 4 80 0 17 3 2 9 6 0 8
1469 0 34 Travel_Rarely 628 Research & Development 8 3 Medical 1 2068 2 Male 82 4 2 Laboratory Technician 3 Married 4404 2 Y No 12 3 1 80 0 6 3 4 4 3 1 2

1470 rows × 34 columns

In [ ]:
# Seprate Categorical and continious columns
In [ ]:
cat = []
con = []
for i in df_7tr.columns:
    if (df_7tr[i].dtypes == 'object'):
        cat.append(i)
    else:
        con.append(i)
In [ ]:
df_train7_con = con
df_train7_con = df_7tr[con]
df_train7_con
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 1102 1 2 1 1 2 94 3 2 4 5993 8 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 279 8 1 1 2 3 61 2 2 2 5130 1 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 1373 2 2 1 4 4 92 2 1 3 2090 6 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 1392 3 4 1 5 4 56 3 1 3 2909 1 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 591 2 1 1 7 1 40 3 1 2 3468 9 12 3 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 36 884 23 2 1 2061 3 41 4 2 4 2571 4 17 3 3 80 1 17 3 3 5 2 0 3
1466 0 39 613 6 1 1 2062 4 42 2 3 1 9991 4 15 3 1 80 1 9 5 3 7 7 1 7
1467 0 27 155 4 3 1 2064 2 87 4 2 2 6142 1 20 4 2 80 1 6 0 3 6 2 0 3
1468 0 49 1023 2 3 1 2065 4 63 2 2 2 5390 2 14 3 4 80 0 17 3 2 9 6 0 8
1469 0 34 628 8 3 1 2068 2 82 4 2 3 4404 2 12 3 1 80 0 6 3 4 4 3 1 2

1470 rows × 26 columns

In [ ]:
df_train7_cat = cat
df_train7_cat = df_7tr[cat]
df_train7_cat
Out[ ]:
BusinessTravel Department EducationField Gender JobRole MaritalStatus Over18 OverTime
0 Travel_Rarely Sales Life Sciences Female Sales Executive Single Y Yes
1 Travel_Frequently Research & Development Life Sciences Male Research Scientist Married Y No
2 Travel_Rarely Research & Development Other Male Laboratory Technician Single Y Yes
3 Travel_Frequently Research & Development Life Sciences Female Research Scientist Married Y Yes
4 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No
... ... ... ... ... ... ... ... ...
1465 Travel_Frequently Research & Development Medical Male Laboratory Technician Married Y No
1466 Travel_Rarely Research & Development Medical Male Healthcare Representative Married Y No
1467 Travel_Rarely Research & Development Life Sciences Male Manufacturing Director Married Y Yes
1468 Travel_Frequently Sales Medical Male Sales Executive Married Y No
1469 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No

1470 rows × 8 columns

In [ ]:
# Visulaisation of categorical columns 
In [ ]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train7_cat.columns):
    if df_train7_cat[i].dtypes=='object':
        plt.subplot(3,3,x1+1)
        sns.countplot(x=df_train7_cat[i])
In [ ]:
# Visualisation continiuos columns
In [ ]:
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train7_con.columns):
    if df_train7_con[i].dtypes=='int64' or df_train7_con[i].dtypes=='float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train7_con[i])
In [ ]:
for i in df_train7_con.columns:
    q1 = df_train7_con[i].quantile(0.25)
    q3 = df_train7_con[i].quantile(0.75)
    IQR = q3-q1
    uppertail = q3+1.5*IQR
    lowertail = q1-1.5*IQR
    df_train7_con.loc[(df_train7_con[i]>uppertail) | (df_train7_con[i]<lowertail)]
    mean_7 = df_train7_con[i].mean()
    df_train7_con.loc[(df_train7_con[i]>uppertail) | (df_train7_con[i]<lowertail),i],mean_7
In [ ]:
plt.figure(figsize=(17,19))
for x1,i in enumerate(df_train7_con.columns):
    if df_train7_con[i].dtypes=='int64' or df_train7_con[i].dtypes=='float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train7_con[i])
In [ ]:
# One hot encoding

df_train7_dum = pd.get_dummies(df_train7_cat)
In [ ]:
df_train7_dum.head()
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes
0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1
1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0
2 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1
3 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1
4 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0
In [ ]:
# Standardization
In [ ]:
std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_train7_con)
In [ ]:
x_lin = pd.DataFrame(std_scaler1,columns=df_train7_con.columns)
In [ ]:
x_lin.head()
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 2.280906 0.446350 0.742527 -1.010909 -0.891688 0.0 -1.701283 -0.660531 1.383138 0.379672 -0.057788 1.153254 -0.108350 2.125136 -1.150554 -0.426230 -1.584178 0.0 -0.932014 -0.421642 -2.171982 -2.493820 -0.164613 -0.063296 -0.679146 0.245834
1 -0.438422 1.322365 -1.297775 -0.147150 -1.868426 0.0 -1.699621 0.254625 -0.240677 -1.026167 -0.057788 -0.660853 -0.291719 -0.678049 2.129306 2.346151 1.191438 0.0 0.241988 -0.164511 0.155707 0.338096 0.488508 0.764998 -0.368715 0.806541
2 2.280906 0.008343 1.414363 -0.887515 -0.891688 0.0 -1.696298 1.169781 1.284725 -1.026167 -0.961486 0.246200 -0.937654 1.324226 -0.057267 -0.426230 -0.658973 0.0 -0.932014 -0.550208 0.155707 0.338096 -1.144294 -1.167687 -0.679146 -1.155935
3 -0.438422 -0.429664 1.461466 -0.764121 1.061787 0.0 -1.694636 1.169781 -0.486709 0.379672 -0.961486 0.246200 -0.763634 -0.678049 -1.150554 -0.426230 0.266233 0.0 -0.932014 -0.421642 0.155707 0.338096 0.161947 0.764998 0.252146 -1.155935
4 -0.438422 -1.086676 -0.524295 -0.887515 -1.868426 0.0 -1.691313 -1.575686 -1.274014 0.379672 -0.961486 -0.660853 -0.644858 2.525591 -0.877232 -0.426230 1.191438 0.0 0.241988 -0.678774 0.155707 0.338096 -0.817734 -0.615492 -0.058285 -0.595227
In [ ]:
df_merge_7 = pd.concat([df_train7_dum,x_lin],axis=1)
df_merge_7
Out[ ]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 1 2.280906 0.446350 0.742527 -1.010909 -0.891688 0.0 -1.701283 -0.660531 1.383138 0.379672 -0.057788 1.153254 -0.108350 2.125136 -1.150554 -0.426230 -1.584178 0.0 -0.932014 -0.421642 -2.171982 -2.493820 -0.164613 -0.063296 -0.679146 0.245834
1 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 -0.438422 1.322365 -1.297775 -0.147150 -1.868426 0.0 -1.699621 0.254625 -0.240677 -1.026167 -0.057788 -0.660853 -0.291719 -0.678049 2.129306 2.346151 1.191438 0.0 0.241988 -0.164511 0.155707 0.338096 0.488508 0.764998 -0.368715 0.806541
2 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 1 2.280906 0.008343 1.414363 -0.887515 -0.891688 0.0 -1.696298 1.169781 1.284725 -1.026167 -0.961486 0.246200 -0.937654 1.324226 -0.057267 -0.426230 -0.658973 0.0 -0.932014 -0.550208 0.155707 0.338096 -1.144294 -1.167687 -0.679146 -1.155935
3 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 1 -0.438422 -0.429664 1.461466 -0.764121 1.061787 0.0 -1.694636 1.169781 -0.486709 0.379672 -0.961486 0.246200 -0.763634 -0.678049 -1.150554 -0.426230 0.266233 0.0 -0.932014 -0.421642 0.155707 0.338096 0.161947 0.764998 0.252146 -1.155935
4 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 -0.438422 -1.086676 -0.524295 -0.887515 -1.868426 0.0 -1.691313 -1.575686 -1.274014 0.379672 -0.961486 -0.660853 -0.644858 2.525591 -0.877232 -0.426230 1.191438 0.0 0.241988 -0.678774 0.155707 0.338096 -0.817734 -0.615492 -0.058285 -0.595227
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 -0.438422 -0.101159 0.202082 1.703764 -0.891688 0.0 1.721670 0.254625 -1.224807 1.785511 -0.057788 1.153254 -0.835451 0.523316 0.489376 -0.426230 0.266233 0.0 0.241988 0.735447 0.155707 0.338096 -0.327893 -0.615492 -0.679146 -0.314873
1466 0 0 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 1 1 0 -0.438422 0.227347 -0.469754 -0.393938 -1.868426 0.0 1.723332 1.169781 -1.175601 -1.026167 0.845911 -1.567907 0.741140 0.523316 -0.057267 -0.426230 -1.584178 0.0 0.241988 -0.293077 1.707500 0.338096 -0.001333 0.764998 -0.368715 0.806541
1467 0 0 1 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 1 -0.438422 -1.086676 -1.605183 -0.640727 0.085049 0.0 1.726655 -0.660531 1.038693 1.785511 -0.057788 -0.660853 -0.076690 -0.678049 1.309341 2.346151 -0.658973 0.0 0.241988 -0.678774 -2.171982 0.338096 -0.164613 -0.615492 -0.679146 -0.314873
1468 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 1 1 0 -0.438422 1.322365 0.546677 -0.887515 0.085049 0.0 1.728317 1.169781 -0.142264 -1.026167 -0.057788 -0.660853 -0.236474 -0.277594 -0.330589 -0.426230 1.191438 0.0 -0.932014 0.735447 0.155707 -1.077862 0.325228 0.488900 -0.679146 1.086895
1469 0 0 1 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 -0.438422 -0.320163 -0.432568 -0.147150 0.085049 0.0 1.733302 -0.660531 0.792660 1.785511 -0.057788 0.246200 -0.445978 -0.277594 -0.877232 -0.426230 -1.584178 0.0 -0.932014 -0.678774 0.155707 1.754054 -0.491174 -0.339394 -0.368715 -0.595227

1470 rows × 55 columns

In [ ]:
# VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif7l = pd.DataFrame()
vif7l['Features'] = df_merge_7.columns
vif7l['VIF'] = [variance_inflation_factor(df_merge_7.values,i) for i in range(df_merge_7.shape[1])]
vif7l
Out[ ]:
Features VIF
0 BusinessTravel_Non-Travel inf
1 BusinessTravel_Travel_Frequently inf
2 BusinessTravel_Travel_Rarely inf
3 Department_Human Resources inf
4 Department_Research & Development inf
5 Department_Sales inf
6 EducationField_Human Resources inf
7 EducationField_Life Sciences inf
8 EducationField_Marketing inf
9 EducationField_Medical inf
10 EducationField_Other inf
11 EducationField_Technical Degree inf
12 Gender_Female inf
13 Gender_Male inf
14 JobRole_Healthcare Representative inf
15 JobRole_Human Resources inf
16 JobRole_Laboratory Technician inf
17 JobRole_Manager inf
18 JobRole_Manufacturing Director inf
19 JobRole_Research Director inf
20 JobRole_Research Scientist inf
21 JobRole_Sales Executive inf
22 JobRole_Sales Representative inf
23 MaritalStatus_Divorced inf
24 MaritalStatus_Married inf
25 MaritalStatus_Single inf
26 Over18_Y 0.000000
27 OverTime_No inf
28 OverTime_Yes inf
29 Attrition 1.347205
30 Age 2.093292
31 DailyRate 1.038089
32 DistanceFromHome 1.031397
33 Education 1.084381
34 EmployeeCount NaN
35 EmployeeNumber 1.035822
36 EnvironmentSatisfaction 1.048560
37 HourlyRate 1.027678
38 JobInvolvement 1.049355
39 JobLevel 14.152924
40 JobSatisfaction 1.043983
41 MonthlyIncome 18.134627
42 NumCompaniesWorked 1.300624
43 PercentSalaryHike 2.565100
44 PerformanceRating 2.550958
45 RelationshipSatisfaction 1.038662
46 StandardHours NaN
47 StockOptionLevel 1.923492
48 TotalWorkingYears 5.019051
49 TrainingTimesLastYear 1.040440
50 WorkLifeBalance 1.034209
51 YearsAtCompany 4.760278
52 YearsInCurrentRole 2.805982
53 YearsSinceLastPromotion 1.729502
54 YearsWithCurrManager 2.856316
In [ ]:
featuresdrop = vif7l.loc[vif7l['VIF']>10]
droplist = featurestodrop['Features']
droplist = list(droplist)
len(droplist)
print(droplist)
['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 'Department_Human Resources', 'Department_Research & Development', 'Department_Sales', 'EducationField_Human Resources', 'EducationField_Life Sciences', 'EducationField_Marketing', 'EducationField_Medical', 'EducationField_Other', 'EducationField_Technical Degree', 'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative', 'JobRole_Human Resources', 'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director', 'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive', 'JobRole_Sales Representative', 'MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_No', 'OverTime_Yes']
In [ ]:
drop_list7 = ['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently', 'BusinessTravel_Travel_Rarely', 
              'Department_Human Resources', 'Department_Research & Development', 'Department_Sales',
              'EducationField_Human Resources','EducationField_Life Sciences', 'EducationField_Marketing', 
              'EducationField_Medical', 'EducationField_Other','EducationField_Technical Degree', 'Gender_Female', 
              'Gender_Male', 'JobRole_Healthcare Representative','JobRole_Human Resources', 
              'JobRole_Laboratory Technician', 'JobRole_Manager', 'JobRole_Manufacturing Director', 
              'JobRole_Research Director', 'JobRole_Research Scientist', 'JobRole_Sales Executive', 
              'JobRole_Sales Representative','MaritalStatus_Divorced', 'MaritalStatus_Married','MaritalStatus_Single', 
              'OverTime_No', 'OverTime_Yes']
In [ ]:
df_lin_7 = df_merge_7.drop(drop_list7,axis=1)
In [ ]:
df_lin_7
Out[ ]:
Over18_Y Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 2.280906 0.446350 0.742527 -1.010909 -0.891688 0.0 -1.701283 -0.660531 1.383138 0.379672 -0.057788 1.153254 -0.108350 2.125136 -1.150554 -0.426230 -1.584178 0.0 -0.932014 -0.421642 -2.171982 -2.493820 -0.164613 -0.063296 -0.679146 0.245834
1 1 -0.438422 1.322365 -1.297775 -0.147150 -1.868426 0.0 -1.699621 0.254625 -0.240677 -1.026167 -0.057788 -0.660853 -0.291719 -0.678049 2.129306 2.346151 1.191438 0.0 0.241988 -0.164511 0.155707 0.338096 0.488508 0.764998 -0.368715 0.806541
2 1 2.280906 0.008343 1.414363 -0.887515 -0.891688 0.0 -1.696298 1.169781 1.284725 -1.026167 -0.961486 0.246200 -0.937654 1.324226 -0.057267 -0.426230 -0.658973 0.0 -0.932014 -0.550208 0.155707 0.338096 -1.144294 -1.167687 -0.679146 -1.155935
3 1 -0.438422 -0.429664 1.461466 -0.764121 1.061787 0.0 -1.694636 1.169781 -0.486709 0.379672 -0.961486 0.246200 -0.763634 -0.678049 -1.150554 -0.426230 0.266233 0.0 -0.932014 -0.421642 0.155707 0.338096 0.161947 0.764998 0.252146 -1.155935
4 1 -0.438422 -1.086676 -0.524295 -0.887515 -1.868426 0.0 -1.691313 -1.575686 -1.274014 0.379672 -0.961486 -0.660853 -0.644858 2.525591 -0.877232 -0.426230 1.191438 0.0 0.241988 -0.678774 0.155707 0.338096 -0.817734 -0.615492 -0.058285 -0.595227
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 1 -0.438422 -0.101159 0.202082 1.703764 -0.891688 0.0 1.721670 0.254625 -1.224807 1.785511 -0.057788 1.153254 -0.835451 0.523316 0.489376 -0.426230 0.266233 0.0 0.241988 0.735447 0.155707 0.338096 -0.327893 -0.615492 -0.679146 -0.314873
1466 1 -0.438422 0.227347 -0.469754 -0.393938 -1.868426 0.0 1.723332 1.169781 -1.175601 -1.026167 0.845911 -1.567907 0.741140 0.523316 -0.057267 -0.426230 -1.584178 0.0 0.241988 -0.293077 1.707500 0.338096 -0.001333 0.764998 -0.368715 0.806541
1467 1 -0.438422 -1.086676 -1.605183 -0.640727 0.085049 0.0 1.726655 -0.660531 1.038693 1.785511 -0.057788 -0.660853 -0.076690 -0.678049 1.309341 2.346151 -0.658973 0.0 0.241988 -0.678774 -2.171982 0.338096 -0.164613 -0.615492 -0.679146 -0.314873
1468 1 -0.438422 1.322365 0.546677 -0.887515 0.085049 0.0 1.728317 1.169781 -0.142264 -1.026167 -0.057788 -0.660853 -0.236474 -0.277594 -0.330589 -0.426230 1.191438 0.0 -0.932014 0.735447 0.155707 -1.077862 0.325228 0.488900 -0.679146 1.086895
1469 1 -0.438422 -0.320163 -0.432568 -0.147150 0.085049 0.0 1.733302 -0.660531 0.792660 1.785511 -0.057788 0.246200 -0.445978 -0.277594 -0.877232 -0.426230 -1.584178 0.0 -0.932014 -0.678774 0.155707 1.754054 -0.491174 -0.339394 -0.368715 -0.595227

1470 rows × 27 columns

In [ ]:
# Splitting the datasets
xtrain,xtest,ytrain,ytest = train_test_split(df_lin_7,y,test_size=0.25,random_state=31)
In [ ]:
# Model Building

lin_model = LinearRegression()
lin_model.fit(xtrain,ytrain)
Out[ ]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [ ]:
# Training data evaluation
ypredtrain = lin_model.predict(xtrain)

MAE = mean_absolute_error(ytrain,ypredtrain)
print('Mean absolute error:',MAE)

MSE = mean_squared_error(ytrain,ypredtrain)
print('Mean squared error:',MSE)

RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)

RSquared = r2_score(ytrain,ypredtrain)
print('R-Squared:',RSquared)

AdjRsquared = 1-((1-RSquared)*(len(xtrain)-1)/(len(xtrain)-len(x_lin.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 6207.680039002682
Mean squared error: 50822766.188194014
Root mean squared error: 7129.008780201776
R-Squared: 0.012529315004031982
AdjRsquared: -0.011353696912149713
In [ ]:
# testing data evaluation
ypredtest = lin_model.predict(xtest)

MAE = mean_absolute_error(ytest,ypredtest)
print('Mean absolute error:',MAE)

MSE = mean_squared_error(ytest,ypredtest)
print('Mean squared error:',MSE)

RMSE = np.sqrt(MSE)
print('Root mean squared error:',RMSE)

RSquared = r2_score(ytest,ypredtest)
print('R-Squared:',RSquared)

AdjRsquared = 1-((1-RSquared)*(len(xtest)-1)/(len(xtest)-len(x_lin.columns)-1))
print('AdjRsquared:',AdjRsquared)
Mean absolute error: 6021.015273843959
Mean squared error: 48787827.28982247
Root mean squared error: 6984.828365094053
R-Squared: -0.01442841066218259
AdjRsquared: -0.09177485839595612

7.5 Lasso¶

In [ ]:
lasso = Lasso()
lasso.fit(xtrain,ytrain)
Out[ ]:
Lasso()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Lasso()
In [ ]:
# Testing Data Evaluation
ypredtest = lasso.predict(xtest)

Mean_squared_error = mean_squared_error(ytest,ypredtest)
print('Mean Squared Error:',Mean_squared_error)

r2Score = r2_score(ytest,ypredtest)
print('R2-Score:',r2Score)
Mean Squared Error: 48774725.24318157
R2-Score: -0.014155984340112893
In [ ]:
# training data evaluation
ypredtrain = lasso.predict(xtrain)

Mean_squared_error = mean_squared_error(ytrain,ypredtrain)
print('Mean Squared Error:',Mean_squared_error)

r2Score = r2_score(ytrain,ypredtrain)
print('R2-Score:',r2Score)
Mean Squared Error: 50822852.237603806
R2-Score: 0.01252764309051757

7.6 Ridge¶

In [ ]:
ridge = Ridge()
ridge.fit(xtrain,ytrain)
Out[ ]:
Ridge()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Ridge()
In [ ]:
# Testing Data Evaluation
ypredtest = ridge.predict(xtest)

Mean_squared_error = mean_squared_error(ytest,ypredtest)
print('Mean Squared Error:',Mean_squared_error)

r2Score = r2_score(ytest,ypredtest)
print('R2-Score:',r2Score)
Mean Squared Error: 48784711.04356809
R2-Score: -0.01436361563213806
In [ ]:
#Training Data Evaluation
ypredtrain = ridge.predict(xtrain)

Mean_squared_error = mean_squared_error(ytrain,ypredtrain)
print('Mean Squared Error:',Mean_squared_error)

r2Score = r2_score(ytrain,ypredtrain)
print('R2-Score:',r2Score)
Mean Squared Error: 50822774.823542684
R2-Score: 0.012529147221868486

7.2 Use KNN¶

In [ ]:
xtrain,xtest,ytrain,ytest = train_test_split(df_lin_7,y,test_size=0.25,random_state=31)
In [ ]:
xtrain
Out[ ]:
Over18_Y Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
1074 1 -0.438422 -0.429664 -0.710227 -0.147150 2.038524 0.0 0.814421 1.169781 0.152975 0.379672 -0.057788 0.246200 -0.024420 -0.277594 0.489376 -0.42623 -1.584178 0.0 -0.932014 0.349751 2.483396 0.338096 -1.144294 -1.167687 -0.679146 -1.155935
1458 1 -0.438422 -0.210661 -1.277942 -1.010909 1.061787 0.0 1.706715 0.254625 -0.191470 -2.432006 -0.961486 1.153254 -0.749185 -0.678049 -0.877232 -0.42623 1.191438 0.0 0.241988 -0.935905 1.707500 0.338096 -0.491174 -0.339394 -0.368715 -0.875581
414 1 2.280906 -1.415181 1.600296 -1.010909 -1.868426 0.0 -0.782403 -1.575686 -0.191470 0.379672 -0.961486 -0.660853 -0.701377 -0.678049 0.216054 -0.42623 -0.658973 0.0 -0.932014 -0.678774 0.931603 0.338096 -0.327893 -0.339394 -0.368715 -0.034520
1371 1 -0.438422 2.088878 1.587900 0.223033 2.038524 0.0 1.512305 1.169781 1.137106 -1.026167 -0.057788 -1.567907 -0.238599 0.523316 0.216054 -0.42623 0.266233 0.0 0.241988 -0.678774 0.155707 0.338096 -1.144294 -1.167687 -0.679146 -1.155935
1140 1 -0.438422 0.774856 1.265617 -0.270544 0.085049 0.0 0.968953 -0.660531 -1.716872 0.379672 2.653309 1.153254 2.665772 -1.078504 -0.330589 -0.42623 1.191438 0.0 0.241988 1.506840 0.931603 -1.077862 2.447870 0.764998 -0.368715 1.647603
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
826 1 -0.438422 0.117845 -0.915993 -1.010909 0.085049 0.0 0.211251 0.254625 -1.421633 1.785511 -0.961486 0.246200 -0.777445 -0.678049 -0.603911 -0.42623 1.191438 0.0 0.241988 -0.550208 -0.620189 1.754054 -0.001333 0.488900 0.873006 -1.155935
610 1 -0.438422 -1.086676 -1.322566 -0.517332 -1.868426 0.0 -0.300531 0.254625 -1.175601 -1.026167 0.845911 1.153254 1.339692 -0.678049 0.216054 -0.42623 -0.658973 0.0 0.241988 -0.293077 0.155707 0.338096 0.325228 1.041095 -0.679146 1.086895
894 1 -0.438422 1.869874 -0.291259 -0.764121 0.085049 0.0 0.374090 1.169781 0.940280 0.379672 1.749610 1.153254 2.395924 0.122861 -0.330589 -0.42623 -1.584178 0.0 -0.932014 3.178192 -0.620189 0.338096 0.488508 1.317193 -0.679146 1.367249
16 1 -0.438422 -0.539166 -1.161424 -0.517332 -0.891688 0.0 -1.668050 -1.575686 0.694247 1.785511 -0.961486 -0.660853 -0.680979 -1.078504 -0.877232 -0.42623 1.191438 0.0 1.415991 -0.550208 1.707500 -1.077862 -0.164613 -0.615492 -0.679146 0.245834
722 1 -0.438422 0.117845 1.458987 0.099639 -1.868426 0.0 -0.031347 0.254625 0.005356 0.379672 -0.961486 0.246200 -0.811441 -1.078504 0.489376 -0.42623 -0.658973 0.0 0.241988 -1.064470 -2.171982 -1.077862 -0.817734 -0.891589 -0.679146 -0.595227

1102 rows × 27 columns

In [ ]:
xtest
Out[ ]:
Over18_Y Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
1343 1 -0.438422 -0.867672 -0.521816 -0.270544 0.085049 0.0 1.425900 1.169781 -0.339090 0.379672 -0.961486 -1.567907 -0.943603 0.122861 -0.330589 -0.426230 -0.658973 0.0 -0.932014 -0.035946 -0.620189 0.338096 -0.654454 -0.615492 -0.368715 -0.595227
334 1 -0.438422 0.884358 -0.628417 -0.147150 1.061787 0.0 -0.951889 1.169781 0.448214 0.379672 -0.057788 1.153254 -0.596200 2.525591 -0.330589 -0.426230 -1.584178 0.0 1.415991 0.092620 0.155707 0.338096 0.488508 1.317193 2.114728 1.086895
1136 1 2.280906 -0.977174 -1.173819 1.827158 0.085049 0.0 0.962306 0.254625 -0.732742 0.379672 -0.961486 -0.660853 -0.870085 -0.678049 0.489376 -0.426230 0.266233 0.0 2.589994 -1.321601 0.155707 0.338096 -0.981014 -0.891589 -0.679146 -1.155935
1080 1 -0.438422 0.993860 -1.424209 -0.764121 0.085049 0.0 0.834361 0.254625 -0.732742 0.379672 1.749610 -0.660853 2.146686 2.125136 -0.877232 -0.426230 1.191438 0.0 0.241988 1.506840 -0.620189 1.754054 0.978348 2.145487 0.873006 -0.875581
396 1 -0.438422 0.665354 1.662273 -0.147150 1.061787 0.0 -0.828928 0.254625 0.399008 0.379672 -0.057788 0.246200 -0.420906 0.523316 -0.330589 -0.426230 1.191438 0.0 -0.932014 -0.421642 0.155707 0.338096 -0.327893 -0.615492 -0.679146 -0.595227
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
808 1 -0.438422 -0.867672 0.754922 2.320735 1.061787 0.0 0.158078 0.254625 1.333932 0.379672 -0.961486 1.153254 -0.847563 0.523316 1.855984 2.346151 -1.584178 0.0 0.241988 -0.035946 -1.396086 0.338096 -0.001333 0.212802 -0.368715 0.806541
1445 1 -0.438422 0.446350 -0.546607 2.320735 1.061787 0.0 1.676806 -1.575686 -0.289883 -1.026167 1.749610 -0.660853 1.501601 -1.078504 2.129306 2.346151 0.266233 0.0 0.241988 1.249709 0.155707 0.338096 2.121310 0.764998 -0.679146 1.647603
1298 1 2.280906 0.993860 -1.342398 1.456975 -0.891688 0.0 1.322879 1.169781 0.005356 0.379672 -0.057788 -0.660853 0.514850 0.523316 1.855984 2.346151 1.191438 0.0 0.241988 0.221185 -0.620189 1.754054 0.325228 0.764998 0.252146 0.806541
29 1 -0.438422 0.993860 -0.241677 -0.887515 1.061787 0.0 -1.639803 -0.660531 0.841867 0.379672 2.653309 -1.567907 2.644099 0.122861 -0.877232 -0.426230 1.191438 0.0 -0.932014 1.378275 -0.620189 -1.077862 -0.817734 -0.615492 -0.058285 -0.875581
867 1 -0.438422 1.431867 1.533360 -0.887515 0.085049 0.0 0.315933 1.169781 -1.766079 0.379672 1.749610 -1.567907 2.412285 -0.277594 1.855984 2.346151 0.266233 0.0 0.241988 2.663930 0.155707 0.338096 -0.817734 -0.615492 -0.058285 -0.595227

368 rows × 27 columns

In [ ]:
ytrain
Out[ ]:
MonthlyRate
1074 22049
1458 8952
414 21972
1371 20328
1140 3549
... ...
826 6004
610 8842
894 23474
16 15053
722 12127

1102 rows × 1 columns

In [ ]:
knn_model = KNeighborsRegressor()
knn_model.fit(xtrain,ytrain)
Out[ ]:
KNeighborsRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsRegressor()
In [ ]:
# Training data evaluation

ypredtrain = knn_model.predict(xtrain)

MAE = mean_absolute_error(ytrain,ypredtrain)
print('MAE:', MAE)

MSE = mean_squared_error(ytrain,ypredtrain)
print('MSE:',MSE)

RMSE = np.sqrt(MSE)
print('RMSE:',RMSE)

RS = r2_score(ytrain,ypredtrain)
print('RS:',RS)

ADJS = 1-((1*RS)*(len(xtrain)-1)/(len(xtrain)-len(x_lin.columns)-1))
print('ADJS:',ADJS)
MAE: 5516.821052631579
MSE: 42177305.469255894
RMSE: 6494.405705625103
RS: 0.18050795250327756
ADJS: 0.8151262737617595
In [ ]:
# testing data evaluation

ypredtest = knn_model.predict(xtest)

MAE = mean_absolute_error(ytest,ypredtest)
print('MAE:', MAE)

MSE = mean_squared_error(ytest,ypredtest)
print('MSE:',MSE)

RMSE = np.sqrt(MSE)
print('RMSE:',RMSE)

RS = r2_score(ytest,ypredtest)
print('RS:',RS)

ADJS = 1-((1*RS)*(len(xtest)-1)/(len(xtest)-len(x_lin.columns)-1))
print('ADJS:',ADJS)
MAE: 6257.771739130435
MSE: 56677783.95021739
RMSE: 7528.464913262025
RS: -0.17848154932014415
ADJS: 1.1920901131979265
In [ ]:
min_max_scaler = MinMaxScaler()
array = min_max_scaler.fit_transform(x_lin)
df_norm = pd.DataFrame(array,columns=x_lin.columns)
df_norm
Out[ ]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement JobLevel JobSatisfaction MonthlyIncome NumCompaniesWorked PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1.0 0.547619 0.715820 0.000000 0.25 0.0 0.000000 0.333333 0.914286 0.666667 0.25 1.000000 0.262454 0.888889 0.000000 0.0 0.000000 0.0 0.000000 0.200 0.000000 0.000000 0.150 0.222222 0.000000 0.294118
1 0.0 0.738095 0.126700 0.250000 0.00 0.0 0.000484 0.666667 0.442857 0.333333 0.25 0.333333 0.217009 0.111111 0.857143 1.0 1.000000 0.0 0.333333 0.250 0.500000 0.666667 0.250 0.388889 0.066667 0.411765
2 1.0 0.452381 0.909807 0.035714 0.25 0.0 0.001451 1.000000 0.885714 0.333333 0.00 0.666667 0.056925 0.666667 0.285714 0.0 0.333333 0.0 0.000000 0.175 0.500000 0.666667 0.000 0.000000 0.000000 0.000000
3 0.0 0.357143 0.923407 0.071429 0.75 0.0 0.001935 1.000000 0.371429 0.666667 0.00 0.666667 0.100053 0.111111 0.000000 0.0 0.666667 0.0 0.000000 0.200 0.500000 0.666667 0.200 0.388889 0.200000 0.000000
4 0.0 0.214286 0.350036 0.035714 0.00 0.0 0.002903 0.000000 0.142857 0.666667 0.00 0.333333 0.129489 1.000000 0.071429 0.0 1.000000 0.0 0.333333 0.150 0.500000 0.666667 0.050 0.111111 0.133333 0.117647
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0.0 0.428571 0.559771 0.785714 0.25 0.0 0.996613 0.666667 0.157143 1.000000 0.25 1.000000 0.082254 0.444444 0.428571 0.0 0.666667 0.0 0.333333 0.425 0.500000 0.666667 0.125 0.111111 0.000000 0.176471
1466 0.0 0.500000 0.365784 0.178571 0.00 0.0 0.997097 1.000000 0.171429 0.333333 0.50 0.000000 0.472986 0.444444 0.285714 0.0 0.000000 0.0 0.333333 0.225 0.833333 0.666667 0.175 0.388889 0.066667 0.411765
1467 0.0 0.214286 0.037938 0.107143 0.50 0.0 0.998065 0.333333 0.814286 1.000000 0.25 0.333333 0.270300 0.111111 0.642857 1.0 0.333333 0.0 0.333333 0.150 0.000000 0.666667 0.150 0.111111 0.000000 0.176471
1468 0.0 0.738095 0.659270 0.035714 0.50 0.0 0.998549 1.000000 0.471429 0.333333 0.25 0.333333 0.230700 0.222222 0.214286 0.0 1.000000 0.0 0.000000 0.425 0.500000 0.333333 0.225 0.333333 0.000000 0.470588
1469 0.0 0.380952 0.376521 0.250000 0.50 0.0 1.000000 0.333333 0.742857 1.000000 0.25 0.666667 0.178778 0.222222 0.071429 0.0 0.000000 0.0 0.000000 0.150 0.500000 1.000000 0.100 0.166667 0.066667 0.117647

1470 rows × 26 columns

In [ ]:
xtrain,xtest,ytrain,ytest = train_test_split(df_norm,y,test_size=0.25,random_state=25)
In [ ]:
knn_model = KNeighborsRegressor()
hyp_grid = {'n_neighbors':np.arange(1,40),'p':[1,2]}

gscv_knn_model = GridSearchCV(knn_model,hyp_grid,cv=5)
gscv_knn_model.fit(xtrain,ytrain)
gscv_knn_model.best_estimator_
Out[ ]:
KNeighborsRegressor(n_neighbors=37)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsRegressor(n_neighbors=37)
In [ ]:
knn_model = KNeighborsRegressor(n_neighbors=37, p=1)
knn_model.fit(xtrain,ytrain)
Out[ ]:
KNeighborsRegressor(n_neighbors=37, p=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsRegressor(n_neighbors=37, p=1)
In [ ]:
# Testing data evaluation
ypredtest = knn_model.predict(xtest)

MAE = mean_absolute_error(ytest,ypredtest)
print('MAE:',MAE)

MSE = mean_squared_error(ytest,ypredtest)
print('MSE:',MSE)

RMSE = np.sqrt(MSE)
print('RMSE',RMSE)

RS = r2_score(ytest,ypredtest)
print('RS:',RS)

ADJS = 1-((1-RS)*(len(xtest)-1)/(len(xtest)-len(x_lin.columns)-1))
print('ADJS:',ADJS)
MAE: 6245.193375440658
MSE: 51160933.41318441
RMSE 7152.687146323709
RS: -0.022467704993947013
ADJS: -0.10042711945096339

7.3] Decision Tree¶

In [ ]:
xtrain,xtest,ytraon,ytest = train_test_split(df_lin_7,y,test_size=0.25,random_state=31)
In [ ]:
# Model building
dt_reg = DecisionTreeRegressor()
dt_reg.fit(xtrain,ytrain)
Out[ ]:
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()
In [ ]:
# training data evaluation
ypredtrain = dt_reg.predict(xtrain)

mse = mean_squared_error(ytrain, ypredtrain)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

r2 = r2_score(ytrain, ypredtrain)
print("R2 :",r2)
MSE : 0.0
RMSE : 0.0
R2 : 1.0
In [ ]:
# testing data evaluation
ypred = dt_reg.predict(xtest)

mse = mean_squared_error(ytest, ypred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

r2 = r2_score(ytest, ypred)
print("R2 :",r2)
MSE : 96889957.37228261
RMSE : 9843.269648459429
R2 : -1.014599356565217

Problem Statements¶

8 From Classification Model Using Y = attrition and choose the best model logistic regression¶

In [10]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegression
from sklearn.neighbors import KNeighborsRegressor,KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,recall_score,f1_score,precision_score

import tensorflow as tf
import keras

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')
In [ ]:
df_log = pd.read_csv('G:\ETLHive Pune\Project\HR-Data Atrition Project\HR-Employee-Attrition-Table 1.csv')
df_log.head()
Out[ ]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender HourlyRate JobInvolvement JobLevel JobRole JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate NumCompaniesWorked Over18 OverTime PercentSalaryHike PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 2 Female 94 3 2 Sales Executive 4 Single 5993 19479 8 Y Yes 11 3 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 3 Male 61 2 2 Research Scientist 2 Married 5130 24907 1 Y No 23 4 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 4 Male 92 2 1 Laboratory Technician 3 Single 2090 2396 6 Y Yes 15 3 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 4 Female 56 3 1 Research Scientist 3 Married 2909 23159 1 Y Yes 11 3 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 1 Male 40 3 1 Laboratory Technician 2 Married 3468 16632 9 Y No 12 3 4 80 1 6 3 3 2 2 2 2
In [11]:
import pandas as pd
from google.colab import files
uploaded = files.upload()
import io
df_log = pd.read_csv(io.BytesIO(uploaded['HR-Employee-Attrition-Table 1.csv']))
df_log.head()
Upload widget is only available when the cell has been executed in the current browser session. Please rerun this cell to enable.
Saving HR-Employee-Attrition-Table 1.csv to HR-Employee-Attrition-Table 1 (2).csv
Out[11]:
Attrition Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 0 49 Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 1 37 Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 0 33 Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 0 27 Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [12]:
y_log = df_log['Attrition']
In [ ]:
# Seprate Categorical and continious columns
In [13]:
cat = []
con = []
for i in df_log.columns:
    if (df_log[i].dtypes == 'object'):
        cat.append(i)
    else:
        con.append(i)
In [14]:
df_train_cat = cat
df_train_cat = df_log[cat]
df_train_cat
Out[14]:
BusinessTravel Department EducationField Gender JobRole MaritalStatus Over18 OverTime
0 Travel_Rarely Sales Life Sciences Female Sales Executive Single Y Yes
1 Travel_Frequently Research & Development Life Sciences Male Research Scientist Married Y No
2 Travel_Rarely Research & Development Other Male Laboratory Technician Single Y Yes
3 Travel_Frequently Research & Development Life Sciences Female Research Scientist Married Y Yes
4 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No
... ... ... ... ... ... ... ... ...
1465 Travel_Frequently Research & Development Medical Male Laboratory Technician Married Y No
1466 Travel_Rarely Research & Development Medical Male Healthcare Representative Married Y No
1467 Travel_Rarely Research & Development Life Sciences Male Manufacturing Director Married Y Yes
1468 Travel_Frequently Sales Medical Male Sales Executive Married Y No
1469 Travel_Rarely Research & Development Medical Male Laboratory Technician Married Y No

1470 rows × 8 columns

In [15]:
df_train_con = con
df_train_con=df_log[con]
df_train_con
Out[15]:
Attrition Age DailyRate DistanceFromHome Education EmployeeCount EmployeeNumber EnvironmentSatisfaction HourlyRate JobInvolvement ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 1 41 1102 1 2 1 1 2 94 3 ... 1 80 0 8 0 1 6 4 0 5
1 0 49 279 8 1 1 2 3 61 2 ... 4 80 1 10 3 3 10 7 1 7
2 1 37 1373 2 2 1 4 4 92 2 ... 2 80 0 7 3 3 0 0 0 0
3 0 33 1392 3 4 1 5 4 56 3 ... 3 80 0 8 3 3 8 7 3 0
4 0 27 591 2 1 1 7 1 40 3 ... 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 36 884 23 2 1 2061 3 41 4 ... 3 80 1 17 3 3 5 2 0 3
1466 0 39 613 6 1 1 2062 4 42 2 ... 1 80 1 9 5 3 7 7 1 7
1467 0 27 155 4 3 1 2064 2 87 4 ... 2 80 1 6 0 3 6 2 0 3
1468 0 49 1023 2 3 1 2065 4 63 2 ... 4 80 0 17 3 2 9 6 0 8
1469 0 34 628 8 3 1 2068 2 82 4 ... 1 80 0 6 3 4 4 3 1 2

1470 rows × 27 columns

In [ ]:
# Removing Outliers
In [ ]:
# Visualisation of continious columns
In [16]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train_con.columns):
    if df_train_con[i].dtypes == 'int64' or df_train_con[i].dtypes == 'float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train_con[i])
In [17]:
for i in df_train_con.columns:
    q1 = df_train_con[i].quantile(0.25)
    q3 = df_train_con[i].quantile(0.75)
    IQR = q3-q1
    uppertail = q3+1.5*IQR
    lowertail = q1-1.5*IQR
    df_train_con.loc[(df_train_con[i]>uppertail)|(df_train_con[i]<lowertail)]
    mean_1= df_train_con[i].mean()
    df_train_con.loc[(df_train_con[i]>uppertail)| (df_train_con[i]<lowertail),i]=mean_1
In [18]:
plt.figure(figsize=(15,19))
for x1,i in enumerate(df_train_con.columns):
    if df_train_con[i].dtypes=='int64' or df_train_con[i].dtypes=='float64':
        plt.subplot(9,3,x1+1)
        sns.boxplot(df_train_con[i])
In [19]:
# One hot Encoding
df_log_dum = pd.get_dummies(df_train_cat)
In [20]:
df_log_dum
Out[20]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently BusinessTravel_Travel_Rarely Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical ... JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single Over18_Y OverTime_No OverTime_Yes
0 0 0 1 0 0 1 0 1 0 0 ... 0 0 1 0 0 0 1 1 0 1
1 0 1 0 0 1 0 0 1 0 0 ... 0 1 0 0 0 1 0 1 1 0
2 0 0 1 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 1 1 0 1
3 0 1 0 0 1 0 0 1 0 0 ... 0 1 0 0 0 1 0 1 0 1
4 0 0 1 0 1 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 1 0 0 1 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 1 0
1466 0 0 1 0 1 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 1 0
1467 0 0 1 0 1 0 0 1 0 0 ... 0 0 0 0 0 1 0 1 0 1
1468 0 1 0 0 0 1 0 0 0 1 ... 0 0 1 0 0 1 0 1 1 0
1469 0 0 1 0 1 0 0 0 0 1 ... 0 0 0 0 0 1 0 1 1 0

1470 rows × 29 columns

In [21]:
# Standardisation

std_scaler = StandardScaler()
std_scaler1 = std_scaler.fit_transform(df_train_con)
In [22]:
x_log = pd.DataFrame(std_scaler1,columns=df_train_con.columns)
In [23]:
x_log.shape
Out[23]:
(1470, 27)
In [24]:
df_final_log = pd.concat([df_log_dum,x_log],axis=1)
In [25]:
df_final_log.columns
Out[25]:
Index(['BusinessTravel_Non-Travel', 'BusinessTravel_Travel_Frequently',
       'BusinessTravel_Travel_Rarely', 'Department_Human Resources',
       'Department_Research & Development', 'Department_Sales',
       'EducationField_Human Resources', 'EducationField_Life Sciences',
       'EducationField_Marketing', 'EducationField_Medical',
       'EducationField_Other', 'EducationField_Technical Degree',
       'Gender_Female', 'Gender_Male', 'JobRole_Healthcare Representative',
       'JobRole_Human Resources', 'JobRole_Laboratory Technician',
       'JobRole_Manager', 'JobRole_Manufacturing Director',
       'JobRole_Research Director', 'JobRole_Research Scientist',
       'JobRole_Sales Executive', 'JobRole_Sales Representative',
       'MaritalStatus_Divorced', 'MaritalStatus_Married',
       'MaritalStatus_Single', 'Over18_Y', 'OverTime_No', 'OverTime_Yes',
       'Attrition', 'Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EmployeeCount', 'EmployeeNumber', 'EnvironmentSatisfaction',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
       'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StandardHours', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
In [ ]:
# Checking Linearity
In [26]:
noncorrelation = [column for column in df_final_log if abs(df_final_log[column].corr(df_final_log['Attrition']))<0.05]
corr_matrix = df_final_log.drop(noncorrelation,axis=1)
corr_matrix.head()
Out[26]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
0 0 0 0 1 0 0 0 0 0 0 ... -0.057788 1.153254 0.129018 0.0 -1.018674 -0.374906 -2.493820 0.041137 -0.018341 0.294570
1 0 1 1 0 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.140791 0.0 0.510149 -0.057867 0.338096 1.069787 0.882230 0.888852
2 0 0 1 0 0 0 0 1 0 0 ... -0.961486 0.246200 -1.091220 0.0 -1.018674 -0.533426 0.338096 -1.501837 -1.219103 -1.191138
3 0 1 1 0 0 0 0 0 0 0 ... -0.961486 0.246200 -0.835167 0.0 -1.018674 -0.374906 0.338096 0.555462 0.882230 -1.191138
4 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -0.660853 -0.660400 0.0 0.510149 -0.691946 0.338096 -0.987512 -0.618722 -0.596855

5 rows × 35 columns

In [ ]:
# Check Multicolinearity
In [27]:
log7 = corr_matrix.drop('Attrition',axis=1)
In [28]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_log = pd.DataFrame()
vif_log['Features'] =log7.columns
vif_log['VIF'] = [variance_inflation_factor(log7.values,i) for i in range(log7.shape[1])]
vif_log
Out[28]:
Features VIF
0 BusinessTravel_Non-Travel 1.046263
1 BusinessTravel_Travel_Frequently 1.042999
2 Department_Research & Development 6.451867
3 Department_Sales 6.756041
4 EducationField_Marketing 1.433720
5 EducationField_Technical Degree 1.030463
6 JobRole_Healthcare Representative 1.765789
7 JobRole_Laboratory Technician 1.517876
8 JobRole_Manager 2.566583
9 JobRole_Manufacturing Director 1.823432
10 JobRole_Research Director 2.516362
11 JobRole_Sales Representative 1.507687
12 MaritalStatus_Divorced inf
13 MaritalStatus_Married inf
14 MaritalStatus_Single inf
15 Over18_Y 0.000000
16 OverTime_No inf
17 OverTime_Yes inf
18 Age 1.607862
19 DailyRate 1.025251
20 DistanceFromHome 1.019110
21 EmployeeCount NaN
22 EnvironmentSatisfaction 1.028213
23 JobInvolvement 1.017833
24 JobLevel 6.005247
25 JobSatisfaction 1.015843
26 MonthlyIncome 2.618679
27 StandardHours NaN
28 StockOptionLevel 2.128814
29 TotalWorkingYears 2.575361
30 WorkLifeBalance 1.021160
31 YearsAtCompany 4.451725
32 YearsInCurrentRole 3.287478
33 YearsWithCurrManager 2.866381
In [29]:
featurestodrop = vif_log.loc[vif_log['VIF']>10]
droplist = featurestodrop['Features']
droplist = list(droplist)
len(droplist)
print(droplist)
['MaritalStatus_Divorced', 'MaritalStatus_Married', 'MaritalStatus_Single', 'OverTime_No', 'OverTime_Yes']
In [30]:
final_log=log7.drop(droplist,axis=1)
In [31]:
final_log
Out[31]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
0 0 0 0 1 0 0 0 0 0 0 ... -0.057788 1.153254 0.129018 0.0 -1.018674 -0.374906 -2.493820 0.041137 -0.018341 0.294570
1 0 1 1 0 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.140791 0.0 0.510149 -0.057867 0.338096 1.069787 0.882230 0.888852
2 0 0 1 0 0 0 0 1 0 0 ... -0.961486 0.246200 -1.091220 0.0 -1.018674 -0.533426 0.338096 -1.501837 -1.219103 -1.191138
3 0 1 1 0 0 0 0 0 0 0 ... -0.961486 0.246200 -0.835167 0.0 -1.018674 -0.374906 0.338096 0.555462 0.882230 -1.191138
4 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -0.660853 -0.660400 0.0 0.510149 -0.691946 0.338096 -0.987512 -0.618722 -0.596855
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 1 1 0 0 0 0 1 0 0 ... -0.057788 1.153254 -0.940839 0.0 0.510149 1.051772 0.338096 -0.216025 -0.618722 -0.299713
1466 0 0 1 0 0 0 1 0 0 0 ... 0.845911 -1.567907 1.378958 0.0 0.510149 -0.216386 0.338096 0.298300 0.882230 0.888852
1467 0 0 1 0 0 0 0 0 0 1 ... -0.057788 -0.660853 0.175602 0.0 0.510149 -0.691946 0.338096 0.041137 -0.618722 -0.299713
1468 0 1 0 1 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.059504 0.0 -1.018674 1.051772 -1.077862 0.812625 0.582040 1.185994
1469 0 0 1 0 0 0 0 1 0 0 ... -0.057788 0.246200 -0.367768 0.0 -1.018674 -0.691946 1.754054 -0.473188 -0.318532 -0.596855

1470 rows × 29 columns

In [32]:
# Spliiting

xtrain,xtest,ytrain,ytest = train_test_split(final_log,y_log,test_size=0.25,random_state=31,stratify=y_log)
In [33]:
xtrain
Out[33]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
1069 0 0 1 0 0 0 0 0 0 0 ... -0.961486 0.246200 -1.255982 0.0 0.510149 -1.484545 -2.493820 -1.244675 -1.219103 -1.191138
841 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -0.660853 -0.620070 0.0 -1.018674 -0.691946 0.338096 -0.473188 -0.318532 -0.596855
332 0 1 1 0 0 0 0 0 0 0 ... -0.057788 0.246200 -0.222390 0.0 -1.018674 1.527331 -1.077862 -0.473188 -0.318532 -0.299713
1337 0 0 0 1 0 0 0 0 0 0 ... -0.961486 -0.660853 -0.851737 0.0 0.510149 -1.484545 0.338096 -1.244675 -1.219103 -1.191138
292 0 1 0 1 1 0 0 0 0 0 ... -0.961486 -0.660853 -0.872684 0.0 0.510149 -1.326025 -1.077862 -0.987512 -0.618722 -0.596855
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
48 0 1 0 1 1 0 0 0 0 0 ... -0.057788 1.153254 0.059925 0.0 -1.018674 0.576212 0.338096 0.812625 0.582040 1.185994
220 0 0 1 0 0 0 0 1 0 0 ... -0.057788 -0.660853 0.104320 0.0 -1.018674 0.893252 1.754054 1.841275 2.082992 0.888852
551 0 0 0 0 0 0 0 0 0 0 ... -0.057788 -0.660853 0.252824 0.0 0.510149 0.259173 -2.493820 0.555462 -0.318532 0.591711
1435 0 0 1 0 0 0 0 0 0 0 ... -0.961486 1.153254 -0.983046 0.0 -1.018674 -0.691946 0.338096 -0.473188 -0.318532 -0.596855
1379 0 1 0 0 0 0 0 0 0 0 ... -0.961486 -0.660853 -0.849548 0.0 -1.018674 -1.484545 0.338096 -1.244675 -1.219103 -1.191138

1102 rows × 29 columns

In [34]:
xtest
Out[34]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
302 0 0 1 0 0 0 1 0 0 0 ... -0.057788 -1.567907 0.025222 0.0 -1.018674 -0.216386 0.338096 0.555462 -0.318532 0.888852
200 0 1 1 0 0 1 0 0 0 1 ... -0.057788 -1.567907 -0.400908 0.0 0.510149 -0.691946 0.338096 -0.987512 -0.618722 -1.191138
944 1 0 1 0 0 0 0 1 0 0 ... -0.057788 1.153254 0.341927 0.0 0.195024 -0.057867 0.338096 0.812625 1.182421 0.294570
929 0 1 1 0 0 0 0 1 0 0 ... -0.961486 1.153254 -0.535657 0.0 0.510149 -1.326025 0.338096 -0.987512 -0.618722 -0.596855
1047 0 1 0 1 0 0 0 0 0 0 ... -0.057788 -1.567907 -0.377460 0.0 2.038972 -0.850466 0.338096 -0.473188 -0.318532 -0.299713
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1310 0 1 1 0 0 0 0 0 0 0 ... 1.749610 0.246200 3.191026 0.0 -1.018674 2.002890 0.338096 -0.987512 -0.618722 -0.596855
766 0 0 1 0 0 0 0 0 0 0 ... 2.653309 0.246200 0.288444 0.0 0.510149 0.144974 -1.077862 0.555462 -0.918912 0.888852
1233 0 0 1 0 0 0 0 0 0 0 ... -0.961486 1.153254 -0.849861 0.0 0.510149 -0.057867 -1.077862 1.069787 -1.219103 1.185994
504 0 1 0 1 0 0 0 0 0 0 ... -0.057788 -1.567907 -0.404660 0.0 2.038972 -0.850466 0.338096 -1.244675 -0.918912 -1.191138
362 1 0 0 1 0 0 0 0 0 0 ... -0.961486 1.153254 -0.928646 0.0 -1.018674 -1.167505 -1.077862 -0.730350 -0.618722 -0.596855

368 rows × 29 columns

In [35]:
ytrain
Out[35]:
1069    0
841     0
332     0
1337    0
292     0
       ..
48      0
220     0
551     0
1435    0
1379    1
Name: Attrition, Length: 1102, dtype: int64
In [36]:
ytest
Out[36]:
302     0
200     0
944     0
929     0
1047    0
       ..
1310    0
766     0
1233    0
504     1
362     0
Name: Attrition, Length: 368, dtype: int64

a) Logistic Regression¶

In [ ]:
# Model Evalution
In [37]:
log_model = LogisticRegression()
log_model.fit(xtrain,ytrain)
Out[37]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [38]:
#  Testing data evaluation
ypredtest = log_model.predict(xtest)

Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)

confusionmatrix = confusion_matrix(ytest,ypredtest)
print('Confusion matrix: \n',confusionmatrix)

Precision = precision_score(ytest,ypredtest)
print('Precision:',Precision)

Recall = recall_score(ytest,ypredtest)
print('Recall:',Recall)

F1score = f1_score(ytest,ypredtest)
print('F1score:',F1score)

Classification_report = classification_report(ytest,ypredtest)
print('Classification report:\n',Classification_report)
Accuracy: 0.8586956521739131
Confusion matrix: 
 [[303   6]
 [ 46  13]]
Precision: 0.6842105263157895
Recall: 0.22033898305084745
F1score: 0.3333333333333333
Classification report:
               precision    recall  f1-score   support

           0       0.87      0.98      0.92       309
           1       0.68      0.22      0.33        59

    accuracy                           0.86       368
   macro avg       0.78      0.60      0.63       368
weighted avg       0.84      0.86      0.83       368

In [39]:
# Training data evaluation
ypredtrain = log_model.predict(xtrain)

Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)

confusionmatrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion matrix: \n',confusionmatrix)

Precision = precision_score(ytrain,ypredtrain)
print('Precision:',Precision)

Recall = recall_score(ytrain,ypredtrain)
print('Recall:',Recall)

F1score = f1_score(ytrain,ypredtrain)
print('F1score:',F1score)

Classification_report = classification_report(ytrain,ypredtrain)
print('Classification report:\n',Classification_report)
Accuracy: 0.8693284936479129
Confusion matrix: 
 [[912  12]
 [132  46]]
Precision: 0.7931034482758621
Recall: 0.25842696629213485
F1score: 0.38983050847457634
Classification report:
               precision    recall  f1-score   support

           0       0.87      0.99      0.93       924
           1       0.79      0.26      0.39       178

    accuracy                           0.87      1102
   macro avg       0.83      0.62      0.66      1102
weighted avg       0.86      0.87      0.84      1102

b] KNN¶

In [40]:
Knn_model = KNeighborsClassifier()
Knn_model.fit(xtrain,ytrain)
Out[40]:
KNeighborsClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
In [41]:
# Testing data evaluation
ypredtest = Knn_model.predict(xtest)

Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)

Confusion_matrix = confusion_matrix(ytest,ypredtest)
print('Confusion_matrix:\n',Confusion_matrix)

Classification_report = classification_report(ytest,ypredtest)
print('Classification_report:\n',Classification_report)
Accuracy: 0.8396739130434783
Confusion_matrix:
 [[300   9]
 [ 50   9]]
Classification_report:
               precision    recall  f1-score   support

           0       0.86      0.97      0.91       309
           1       0.50      0.15      0.23        59

    accuracy                           0.84       368
   macro avg       0.68      0.56      0.57       368
weighted avg       0.80      0.84      0.80       368

In [42]:
# Training data evaluation
ypredtrain = Knn_model.predict(xtrain)

Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)

Confusion_matrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion_matrix:\n',Confusion_matrix)

Classification_report = classification_report(ytrain,ypredtrain)
print('Classification_report:\n',Classification_report)
Accuracy: 0.8702359346642469
Confusion_matrix:
 [[911  13]
 [130  48]]
Classification_report:
               precision    recall  f1-score   support

           0       0.88      0.99      0.93       924
           1       0.79      0.27      0.40       178

    accuracy                           0.87      1102
   macro avg       0.83      0.63      0.66      1102
weighted avg       0.86      0.87      0.84      1102

c] Naive Bayes Algorithm¶

In [43]:
from sklearn.naive_bayes import BernoulliNB
In [44]:
naive_model = BernoulliNB(binarize=True)
naive_model.fit(xtrain,ytrain)
Out[44]:
BernoulliNB(binarize=True)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
BernoulliNB(binarize=True)
In [45]:
# Training data evaluation

ypredtrain = naive_model.predict(xtrain)

Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)

confusionmatrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion matrix: \n',confusionmatrix)

Precision = precision_score(ytrain,ypredtrain)
print('Precision:',Precision)

Recall = recall_score(ytrain,ypredtrain)
print('Recall:',Recall)

F1score = f1_score(ytrain,ypredtrain)
print('F1score:',F1score)

Classification_report = classification_report(ytrain,ypredtrain)
print('Classification report:\n',Classification_report)
Accuracy: 0.838475499092559
Confusion matrix: 
 [[924   0]
 [178   0]]
Precision: 0.0
Recall: 0.0
F1score: 0.0
Classification report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       924
           1       0.00      0.00      0.00       178

    accuracy                           0.84      1102
   macro avg       0.42      0.50      0.46      1102
weighted avg       0.70      0.84      0.76      1102

In [46]:
# Testing data evaluation

ypredtest = naive_model.predict(xtest)

Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)

confusionmatrix = confusion_matrix(ytest,ypredtest)
print('Confusion matrix: \n',confusionmatrix)

Precision = precision_score(ytest,ypredtest)
print('Precision:',Precision)

Recall = recall_score(ytest,ypredtest)
print('Recall:',Recall)

F1score = f1_score(ytest,ypredtest)
print('F1score:',F1score)

Classification_report = classification_report(ytest,ypredtest)
print('Classification report:\n',Classification_report)
Accuracy: 0.8396739130434783
Confusion matrix: 
 [[309   0]
 [ 59   0]]
Precision: 0.0
Recall: 0.0
F1score: 0.0
Classification report:
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       309
           1       0.00      0.00      0.00        59

    accuracy                           0.84       368
   macro avg       0.42      0.50      0.46       368
weighted avg       0.71      0.84      0.77       368

d] Decision tree¶

In [47]:
# Model Training

dt_reg = DecisionTreeRegressor()
dt_reg.fit(xtrain, ytrain)
Out[47]:
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeRegressor()
In [48]:
# Training data evaluation

ypredtrain = dt_reg.predict(xtrain)

mse = mean_squared_error(ytrain, ypredtrain)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

r2 = r2_score(ytrain, ypredtrain)
print("R2 :",r2)
MSE : 0.0
RMSE : 0.0
R2 : 1.0
In [49]:
# Testing data Evaluation

ypred = dt_reg.predict(xtest)

mse = mean_squared_error(ytest, ypred)
print("MSE :",mse)

rmse = np.sqrt(mse)
print("RMSE :",rmse)

r2 = r2_score(ytest, ypred)
print("R2 :",r2)
MSE : 0.28532608695652173
RMSE : 0.5341592337089398
R2 : -1.119466842191871

e] SVM¶

In [50]:
# Model Training

svc_model = SVC()
svc_model.fit(xtrain,ytrain)
Out[50]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [51]:
# Testing data evaluation
ypredtest = svc_model.predict(xtest)


Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)

Confusion_matrix = confusion_matrix(ytest,ypredtest)
print('Confusion_matrix: \n',Confusion_matrix)

Classification_report = classification_report(ytest,ypredtest)
print('Classification_report: \n',Classification_report)
Accuracy: 0.8478260869565217
Confusion_matrix: 
 [[307   2]
 [ 54   5]]
Classification_report: 
               precision    recall  f1-score   support

           0       0.85      0.99      0.92       309
           1       0.71      0.08      0.15        59

    accuracy                           0.85       368
   macro avg       0.78      0.54      0.53       368
weighted avg       0.83      0.85      0.79       368

In [52]:
# Training data evaluation
ypredtrain = svc_model.predict(xtrain)

Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)

Confusion_matrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion_matrix: \n',Confusion_matrix)

Classification_report = classification_report(ytrain,ypredtrain)
print('Classification_report: \n',Classification_report)
Accuracy: 0.8720508166969148
Confusion_matrix: 
 [[921   3]
 [138  40]]
Classification_report: 
               precision    recall  f1-score   support

           0       0.87      1.00      0.93       924
           1       0.93      0.22      0.36       178

    accuracy                           0.87      1102
   macro avg       0.90      0.61      0.65      1102
weighted avg       0.88      0.87      0.84      1102

In [ ]:
# Hyperparameter tuning
In [53]:
svc_model = SVC()

hyp_grid = {'C':np.arange(0,50),
            'kernel' : ['linear', 'poly', 'rbf', 'sigmoid']}

gscv_scv_model = GridSearchCV(svc_model,hyp_grid,cv=5)

gscv_scv_model.fit(xtrain,ytrain)
Out[53]:
GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})
SVC()
SVC()
In [54]:
gscv_scv_model.best_estimator_
Out[54]:
SVC(C=1, kernel='poly')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(C=1, kernel='poly')
In [55]:
svc_model = SVC(C=1, kernel='linear')
svc_model.fit(xtrain,ytrain)
Out[55]:
SVC(C=1, kernel='linear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(C=1, kernel='linear')
In [56]:
# Testing data evaluation
ypredtest = svc_model.predict(xtest)


Accuracy = accuracy_score(ytest,ypredtest)
print('Accuracy:',Accuracy)

Confusion_matrix = confusion_matrix(ytest,ypredtest)
print('Confusion_matrix: \n',Confusion_matrix)

Classification_report = classification_report(ytest,ypredtest)
print('Classification_report: \n',Classification_report)
Accuracy: 0.8396739130434783
Confusion_matrix: 
 [[309   0]
 [ 59   0]]
Classification_report: 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       309
           1       0.00      0.00      0.00        59

    accuracy                           0.84       368
   macro avg       0.42      0.50      0.46       368
weighted avg       0.71      0.84      0.77       368

In [57]:
# Training data evaluation
ypredtrain = svc_model.predict(xtrain)

Accuracy = accuracy_score(ytrain,ypredtrain)
print('Accuracy:',Accuracy)

Confusion_matrix = confusion_matrix(ytrain,ypredtrain)
print('Confusion_matrix: \n',Confusion_matrix)

Classification_report = classification_report(ytrain,ypredtrain)
print('Classification_report: \n',Classification_report)
Accuracy: 0.838475499092559
Confusion_matrix: 
 [[924   0]
 [178   0]]
Classification_report: 
               precision    recall  f1-score   support

           0       0.84      1.00      0.91       924
           1       0.00      0.00      0.00       178

    accuracy                           0.84      1102
   macro avg       0.42      0.50      0.46      1102
weighted avg       0.70      0.84      0.76      1102

f] ANN¶

In [ ]:
# Model Building
In [58]:
import tensorflow as tf
import keras
In [59]:
from keras import Sequential
from keras.layers import Dense,Dropout
In [60]:
nn = Sequential()
 # Create input layer and 1st Hidden layer
nn.add(Dense(units = 6,activation='relu',kernel_initializer='he_uniform',input_dim=(29)))  

# Add 1st Dropout layer
nn.add(Dropout(rate=0.5))

# Add 2nd Hidden layer
nn.add(Dense(units = 6,activation='relu',kernel_initializer='he_uniform')) 

# Add 2nd Dropout layer
nn.add(Dropout(rate=0.5))

# Add output layer
nn.add(Dense(units=1 ,activation='sigmoid', kernel_initializer='glorot_uniform'))
In [61]:
# Compile
nn.compile(optimizer='Adam',loss='binary_crossentropy' ,metrics=['accuracy','Recall','Precision'])
In [62]:
# Training
model_train= nn.fit(xtrain,ytrain,validation_split=0.2,epochs=100,batch_size=20)
Epoch 1/100
45/45 [==============================] - 2s 10ms/step - loss: 0.7820 - accuracy: 0.7242 - recall: 0.1942 - precision: 0.1709 - val_loss: 0.6188 - val_accuracy: 0.7602 - val_recall: 0.2821 - val_precision: 0.3056
Epoch 2/100
45/45 [==============================] - 0s 3ms/step - loss: 0.6897 - accuracy: 0.7582 - recall: 0.1655 - precision: 0.1917 - val_loss: 0.5879 - val_accuracy: 0.8416 - val_recall: 0.2051 - val_precision: 0.6667
Epoch 3/100
45/45 [==============================] - 0s 3ms/step - loss: 0.6642 - accuracy: 0.7696 - recall: 0.1295 - precision: 0.1800 - val_loss: 0.5646 - val_accuracy: 0.8235 - val_recall: 0.0256 - val_precision: 0.5000
Epoch 4/100
45/45 [==============================] - 0s 3ms/step - loss: 0.6437 - accuracy: 0.7877 - recall: 0.1511 - precision: 0.2333 - val_loss: 0.5488 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 5/100
45/45 [==============================] - 0s 3ms/step - loss: 0.5764 - accuracy: 0.8036 - recall: 0.1007 - precision: 0.2258 - val_loss: 0.5312 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 6/100
45/45 [==============================] - 0s 3ms/step - loss: 0.5815 - accuracy: 0.8184 - recall: 0.1007 - precision: 0.2857 - val_loss: 0.5173 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 7/100
45/45 [==============================] - 0s 3ms/step - loss: 0.5608 - accuracy: 0.8150 - recall: 0.0719 - precision: 0.2273 - val_loss: 0.5092 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 8/100
45/45 [==============================] - 0s 5ms/step - loss: 0.5603 - accuracy: 0.8070 - recall: 0.0432 - precision: 0.1395 - val_loss: 0.5015 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 9/100
45/45 [==============================] - 0s 4ms/step - loss: 0.5463 - accuracy: 0.8297 - recall: 0.0288 - precision: 0.2105 - val_loss: 0.4936 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 10/100
45/45 [==============================] - 0s 4ms/step - loss: 0.5165 - accuracy: 0.8343 - recall: 0.0576 - precision: 0.3478 - val_loss: 0.4856 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 11/100
45/45 [==============================] - 0s 4ms/step - loss: 0.5143 - accuracy: 0.8365 - recall: 0.0360 - precision: 0.3333 - val_loss: 0.4780 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 12/100
45/45 [==============================] - 0s 4ms/step - loss: 0.5000 - accuracy: 0.8388 - recall: 0.0288 - precision: 0.3636 - val_loss: 0.4719 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 13/100
45/45 [==============================] - 0s 5ms/step - loss: 0.4635 - accuracy: 0.8388 - recall: 0.0144 - precision: 0.2857 - val_loss: 0.4664 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 14/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4849 - accuracy: 0.8331 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4618 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 15/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4845 - accuracy: 0.8343 - recall: 0.0072 - precision: 0.1111 - val_loss: 0.4566 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 16/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4765 - accuracy: 0.8388 - recall: 0.0144 - precision: 0.2857 - val_loss: 0.4528 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 17/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4894 - accuracy: 0.8365 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4501 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 18/100
45/45 [==============================] - 0s 5ms/step - loss: 0.4667 - accuracy: 0.8365 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4474 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 19/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4559 - accuracy: 0.8400 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4430 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 20/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4641 - accuracy: 0.8411 - recall: 0.0072 - precision: 0.3333 - val_loss: 0.4423 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 21/100
45/45 [==============================] - 0s 5ms/step - loss: 0.4657 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4409 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 22/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4454 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4398 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 23/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4594 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4381 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 24/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4510 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4368 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 25/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4536 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4362 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 26/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4648 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4358 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 27/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4404 - accuracy: 0.8388 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4349 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 28/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4421 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4347 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 29/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4382 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4339 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 30/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4371 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4335 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 31/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4403 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4344 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 32/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4349 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4335 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 33/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4197 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4315 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 34/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4419 - accuracy: 0.8400 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4314 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 35/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4475 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4328 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 36/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4359 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4329 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 37/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4391 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4339 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 38/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4329 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4330 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 39/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4217 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4327 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 40/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4311 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4334 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 41/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4287 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4323 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 42/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4163 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4312 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 43/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4315 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4312 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 44/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4261 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4308 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 45/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4284 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4306 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 46/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4213 - accuracy: 0.8400 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4310 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 47/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4313 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4321 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 48/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4386 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4323 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 49/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4193 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4332 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 50/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4189 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4317 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 51/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4091 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4324 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 52/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4156 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4322 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 53/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4353 - accuracy: 0.8411 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4330 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 54/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4202 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4346 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 55/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4289 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4356 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 56/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4106 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4363 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 57/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4206 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 58/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4136 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 59/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4102 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 60/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4167 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4374 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 61/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4187 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4380 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 62/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4080 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4387 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 63/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4085 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4401 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 64/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4196 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4384 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 65/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4223 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4379 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 66/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4198 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 67/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4205 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4380 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 68/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4081 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4371 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 69/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4197 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4375 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 70/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4095 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4383 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 71/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4032 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4383 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 72/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4121 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4374 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 73/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4096 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4377 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 74/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4055 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4402 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 75/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4002 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4416 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 76/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4028 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4400 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 77/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4061 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4389 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 78/100
45/45 [==============================] - 0s 3ms/step - loss: 0.3982 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4400 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 79/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4066 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4419 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 80/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4022 - accuracy: 0.8422 - recall: 0.0000e+00 - precision: 0.0000e+00 - val_loss: 0.4416 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 81/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4022 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4401 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 82/100
45/45 [==============================] - 0s 3ms/step - loss: 0.3980 - accuracy: 0.8445 - recall: 0.0144 - precision: 1.0000 - val_loss: 0.4398 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 83/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4137 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4415 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 84/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4042 - accuracy: 0.8411 - recall: 0.0072 - precision: 0.3333 - val_loss: 0.4415 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 85/100
45/45 [==============================] - 0s 4ms/step - loss: 0.3928 - accuracy: 0.8434 - recall: 0.0072 - precision: 1.0000 - val_loss: 0.4411 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 86/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4048 - accuracy: 0.8422 - recall: 0.0072 - precision: 0.5000 - val_loss: 0.4424 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 87/100
45/45 [==============================] - 0s 3ms/step - loss: 0.4015 - accuracy: 0.8400 - recall: 0.0072 - precision: 0.2500 - val_loss: 0.4437 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 88/100
45/45 [==============================] - 0s 4ms/step - loss: 0.3983 - accuracy: 0.8445 - recall: 0.0144 - precision: 1.0000 - val_loss: 0.4448 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 89/100
45/45 [==============================] - 0s 5ms/step - loss: 0.3965 - accuracy: 0.8502 - recall: 0.0504 - precision: 1.0000 - val_loss: 0.4444 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 90/100
45/45 [==============================] - 0s 5ms/step - loss: 0.3985 - accuracy: 0.8479 - recall: 0.0576 - precision: 0.7273 - val_loss: 0.4441 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 91/100
45/45 [==============================] - 0s 4ms/step - loss: 0.3975 - accuracy: 0.8445 - recall: 0.0432 - precision: 0.6000 - val_loss: 0.4457 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 92/100
45/45 [==============================] - 0s 4ms/step - loss: 0.3998 - accuracy: 0.8445 - recall: 0.0360 - precision: 0.6250 - val_loss: 0.4469 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 93/100
45/45 [==============================] - 0s 4ms/step - loss: 0.3867 - accuracy: 0.8468 - recall: 0.0288 - precision: 1.0000 - val_loss: 0.4471 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 94/100
45/45 [==============================] - 0s 5ms/step - loss: 0.3866 - accuracy: 0.8468 - recall: 0.0432 - precision: 0.7500 - val_loss: 0.4496 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 95/100
45/45 [==============================] - 0s 4ms/step - loss: 0.3983 - accuracy: 0.8468 - recall: 0.0432 - precision: 0.7500 - val_loss: 0.4508 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 96/100
45/45 [==============================] - 0s 5ms/step - loss: 0.3904 - accuracy: 0.8422 - recall: 0.0288 - precision: 0.5000 - val_loss: 0.4485 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 97/100
45/45 [==============================] - 0s 4ms/step - loss: 0.4032 - accuracy: 0.8411 - recall: 0.0360 - precision: 0.4545 - val_loss: 0.4491 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 98/100
45/45 [==============================] - 0s 5ms/step - loss: 0.3996 - accuracy: 0.8422 - recall: 0.0288 - precision: 0.5000 - val_loss: 0.4503 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 99/100
45/45 [==============================] - 0s 5ms/step - loss: 0.3903 - accuracy: 0.8456 - recall: 0.0288 - precision: 0.8000 - val_loss: 0.4498 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
Epoch 100/100
45/45 [==============================] - 0s 5ms/step - loss: 0.3906 - accuracy: 0.8434 - recall: 0.0360 - precision: 0.5556 - val_loss: 0.4499 - val_accuracy: 0.8235 - val_recall: 0.0000e+00 - val_precision: 0.0000e+00
In [ ]:
# Model Testing
In [63]:
ypredtest = nn.predict(xtest)
12/12 [==============================] - 0s 1ms/step
In [64]:
ypredtest
Out[64]:
array([[0.01510506],
       [0.20445046],
       [0.02313249],
       [0.2447614 ],
       [0.11240595],
       [0.25176114],
       [0.17864135],
       [0.01808105],
       [0.15272936],
       [0.27237415],
       [0.26029631],
       [0.0557157 ],
       [0.16247971],
       [0.10556228],
       [0.12715463],
       [0.05700505],
       [0.20292723],
       [0.09968841],
       [0.15977632],
       [0.17864135],
       [0.2798506 ],
       [0.17864135],
       [0.17864135],
       [0.30207202],
       [0.06405034],
       [0.17864135],
       [0.06013674],
       [0.16053565],
       [0.17589574],
       [0.21343994],
       [0.0364303 ],
       [0.04026547],
       [0.12827843],
       [0.17406179],
       [0.05245738],
       [0.00125493],
       [0.15511252],
       [0.15144907],
       [0.23954777],
       [0.14012903],
       [0.15522859],
       [0.2233578 ],
       [0.16509353],
       [0.26715297],
       [0.07053942],
       [0.24002005],
       [0.14947313],
       [0.22769532],
       [0.1019326 ],
       [0.2713851 ],
       [0.1695767 ],
       [0.1945976 ],
       [0.13116962],
       [0.17864135],
       [0.00316014],
       [0.0790848 ],
       [0.17864135],
       [0.00331617],
       [0.20549418],
       [0.08482029],
       [0.05072754],
       [0.06374671],
       [0.0651449 ],
       [0.2939182 ],
       [0.08386078],
       [0.12554254],
       [0.19114166],
       [0.14072482],
       [0.26029631],
       [0.01196847],
       [0.0507173 ],
       [0.04790408],
       [0.31563863],
       [0.29325575],
       [0.06633819],
       [0.26757872],
       [0.17632928],
       [0.07789939],
       [0.18047896],
       [0.0004058 ],
       [0.17864135],
       [0.27902952],
       [0.00927696],
       [0.17864135],
       [0.01344493],
       [0.3010726 ],
       [0.22933711],
       [0.17232682],
       [0.26729128],
       [0.11951035],
       [0.23603752],
       [0.26029631],
       [0.17864135],
       [0.15779273],
       [0.01229363],
       [0.03191255],
       [0.0187415 ],
       [0.15875162],
       [0.21060082],
       [0.05792176],
       [0.0768427 ],
       [0.27470306],
       [0.00513243],
       [0.07764331],
       [0.2619322 ],
       [0.08740467],
       [0.10199311],
       [0.21328129],
       [0.12665364],
       [0.04587372],
       [0.06382611],
       [0.02262612],
       [0.08895383],
       [0.23007524],
       [0.11267168],
       [0.14078857],
       [0.25319505],
       [0.26330686],
       [0.08358296],
       [0.08433616],
       [0.10481097],
       [0.17864135],
       [0.17864135],
       [0.1491308 ],
       [0.14289218],
       [0.09642561],
       [0.08585224],
       [0.17210375],
       [0.17864135],
       [0.05308583],
       [0.17864135],
       [0.25155568],
       [0.01316426],
       [0.03361205],
       [0.1646049 ],
       [0.17024504],
       [0.24575444],
       [0.17864135],
       [0.17940202],
       [0.14060722],
       [0.17864135],
       [0.08208651],
       [0.17864135],
       [0.16486545],
       [0.19501169],
       [0.17864135],
       [0.17864135],
       [0.02165459],
       [0.17864135],
       [0.15567924],
       [0.1198564 ],
       [0.125373  ],
       [0.26029631],
       [0.15322718],
       [0.04668615],
       [0.1608112 ],
       [0.02872171],
       [0.06027906],
       [0.18825325],
       [0.2448634 ],
       [0.17864135],
       [0.07112506],
       [0.15243876],
       [0.09913176],
       [0.17864135],
       [0.09134295],
       [0.27147523],
       [0.11566301],
       [0.17864135],
       [0.17864135],
       [0.2664947 ],
       [0.08195227],
       [0.11174984],
       [0.01494817],
       [0.20882936],
       [0.15095532],
       [0.00298889],
       [0.18600288],
       [0.08491654],
       [0.20261735],
       [0.14526552],
       [0.17864135],
       [0.21982045],
       [0.2440781 ],
       [0.26029631],
       [0.18553853],
       [0.21339819],
       [0.28174663],
       [0.23102461],
       [0.26029631],
       [0.22908273],
       [0.26029631],
       [0.19858061],
       [0.1377052 ],
       [0.03284998],
       [0.26029631],
       [0.1392803 ],
       [0.17864135],
       [0.20481087],
       [0.17864135],
       [0.27247092],
       [0.24391419],
       [0.01871445],
       [0.13344584],
       [0.09237079],
       [0.00121117],
       [0.08360118],
       [0.00102865],
       [0.16539492],
       [0.27777648],
       [0.25430283],
       [0.0637172 ],
       [0.17864135],
       [0.03634345],
       [0.26332825],
       [0.27286452],
       [0.30843747],
       [0.08077957],
       [0.26658297],
       [0.17102545],
       [0.15640703],
       [0.17864135],
       [0.13594404],
       [0.05219484],
       [0.00595472],
       [0.0783316 ],
       [0.17864135],
       [0.2593516 ],
       [0.16748501],
       [0.06541692],
       [0.23679696],
       [0.00642795],
       [0.2756765 ],
       [0.0244295 ],
       [0.03046309],
       [0.17864135],
       [0.06599958],
       [0.1735468 ],
       [0.26029631],
       [0.01298711],
       [0.17864135],
       [0.14433551],
       [0.22224829],
       [0.26916125],
       [0.01386248],
       [0.0545248 ],
       [0.10220438],
       [0.28250805],
       [0.1060359 ],
       [0.17695737],
       [0.10343977],
       [0.17864135],
       [0.05588656],
       [0.2539565 ],
       [0.03529732],
       [0.02105915],
       [0.1310736 ],
       [0.16897002],
       [0.17864135],
       [0.16364472],
       [0.08125693],
       [0.16763338],
       [0.17864135],
       [0.15711759],
       [0.19832133],
       [0.08390985],
       [0.23529173],
       [0.03127239],
       [0.17864135],
       [0.26029631],
       [0.22231479],
       [0.17864135],
       [0.21266054],
       [0.23025179],
       [0.18405075],
       [0.16127308],
       [0.16364063],
       [0.13972007],
       [0.01925687],
       [0.27160215],
       [0.30919692],
       [0.00588664],
       [0.01169947],
       [0.09484629],
       [0.26029631],
       [0.17864135],
       [0.14511535],
       [0.18220787],
       [0.0388039 ],
       [0.17864135],
       [0.09844014],
       [0.04057379],
       [0.1337107 ],
       [0.0125258 ],
       [0.16890198],
       [0.29679382],
       [0.09167103],
       [0.02465587],
       [0.17864135],
       [0.05458304],
       [0.08702319],
       [0.03677688],
       [0.17675959],
       [0.00092281],
       [0.0072814 ],
       [0.14503984],
       [0.10235583],
       [0.17864135],
       [0.01230869],
       [0.23082604],
       [0.00960664],
       [0.11529443],
       [0.03483866],
       [0.15413877],
       [0.07259345],
       [0.17864135],
       [0.17864135],
       [0.14462319],
       [0.17057335],
       [0.17864135],
       [0.15546933],
       [0.14479683],
       [0.17864135],
       [0.15372086],
       [0.06519894],
       [0.17864135],
       [0.0192605 ],
       [0.04998954],
       [0.09350831],
       [0.120751  ],
       [0.17354558],
       [0.07330868],
       [0.07280812],
       [0.17864135],
       [0.09785005],
       [0.17864135],
       [0.10088717],
       [0.05949574],
       [0.15908647],
       [0.1254853 ],
       [0.14708665],
       [0.0614772 ],
       [0.13817297],
       [0.02034603],
       [0.1802322 ],
       [0.2787524 ],
       [0.29827794],
       [0.16202964],
       [0.03965331],
       [0.24010457],
       [0.07203472],
       [0.01714903],
       [0.11134897],
       [0.27343938],
       [0.00378489],
       [0.03697024],
       [0.02776049],
       [0.06400913],
       [0.19636442],
       [0.14231376],
       [0.12078112],
       [0.1963701 ],
       [0.02792522],
       [0.17864135],
       [0.0022663 ],
       [0.23789994],
       [0.17864135],
       [0.24822074]], dtype=float32)
In [65]:
ypredtest = list(map(int,(ypredtest>0.4)))
In [66]:
print(ypredtest)
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
In [67]:
Accuracy = accuracy_score(ytest,ypredtest)
Accuracy
Out[67]:
0.8396739130434783
In [68]:
final_y = pd.DataFrame(ypredtest)
In [69]:
final_pred = final_y.replace({1:'Y',0:'N'})
final_pred
Out[69]:
0
0 N
1 N
2 N
3 N
4 N
... ...
363 N
364 N
365 N
366 N
367 N

368 rows × 1 columns

In [ ]:
#  choose prediction error and prediction of best model is logistic Model 
#  accuracy = 0.8586

Final Prediction¶

In [70]:
ypredtest = log_model.predict(xtest)
In [71]:
ypredtest
Out[71]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
In [72]:
Final_pred = pd.DataFrame(ypredtest)
In [73]:
Final_pred
Out[73]:
0
0 0
1 0
2 0
3 0
4 0
... ...
363 0
364 0
365 0
366 0
367 0

368 rows × 1 columns

Problem Statements¶

In [ ]:
# 9. Clustering - Find intersting clusters using K-Means, Heirarchical and DBSCAN clustering.
# Connect to Domain scenario and its usefulness in analysis (Ignore Attrition column)

a] Grouping of Employees¶

In [74]:
final_log.head()
Out[74]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
0 0 0 0 1 0 0 0 0 0 0 ... -0.057788 1.153254 0.129018 0.0 -1.018674 -0.374906 -2.493820 0.041137 -0.018341 0.294570
1 0 1 1 0 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.140791 0.0 0.510149 -0.057867 0.338096 1.069787 0.882230 0.888852
2 0 0 1 0 0 0 0 1 0 0 ... -0.961486 0.246200 -1.091220 0.0 -1.018674 -0.533426 0.338096 -1.501837 -1.219103 -1.191138
3 0 1 1 0 0 0 0 0 0 0 ... -0.961486 0.246200 -0.835167 0.0 -1.018674 -0.374906 0.338096 0.555462 0.882230 -1.191138
4 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -0.660853 -0.660400 0.0 0.510149 -0.691946 0.338096 -0.987512 -0.618722 -0.596855

5 rows × 29 columns

1. kmeans Clustering¶

In [ ]:
# Model Building
In [75]:
kmeans_model = KMeans(n_clusters=5)  # n_clusters=8 by default
kmeans_model.fit(x_log)
Out[75]:
KMeans(n_clusters=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=5)
In [ ]:
# With cluster some of squares
In [76]:
kmeans_model.inertia_
Out[76]:
28488.260625226452
In [ ]:
# Labeling Variable
In [77]:
C_var = kmeans_model.fit_predict(x_log)
C_var
Out[77]:
array([3, 4, 3, ..., 4, 2, 0], dtype=int32)
In [ ]:
# Cluster = 0
In [78]:
index_0 = np.where(C_var==0)
index_0
Out[78]:
(array([   3,    4,   10,   12,   13,   16,   17,   19,   20,   23,   30,
          31,   32,   35,   37,   38,   39,   40,   41,   49,   53,   54,
          57,   66,   68,   72,   74,   81,   84,   86,   97,   99,  101,
         104,  108,  109,  113,  114,  115,  125,  128,  130,  138,  141,
         143,  145,  146,  149,  156,  159,  161,  162,  164,  170,  174,
         175,  176,  179,  181,  183,  184,  191,  193,  196,  197,  198,
         200,  202,  203,  206,  207,  208,  224,  225,  230,  232,  238,
         240,  241,  242,  246,  248,  249,  252,  253,  254,  255,  258,
         262,  265,  267,  273,  274,  284,  287,  292,  294,  298,  299,
         301,  302,  309,  310,  312,  318,  320,  328,  330,  331,  333,
         337,  340,  345,  346,  347,  349,  350,  351,  354,  358,  365,
         369,  371,  372,  373,  374,  377,  380,  381,  383,  387,  388,
         389,  393,  396,  397,  399,  402,  404,  409,  413,  419,  428,
         430,  431,  437,  438,  441,  449,  450,  454,  460,  461,  470,
         471,  474,  478,  481,  483,  485,  486,  487,  488,  490,  493,
         494,  496,  499,  500,  505,  507,  511,  512,  515,  516,  520,
         522,  539,  542,  543,  546,  548,  549,  550,  555,  556,  557,
         559,  560,  565,  570,  571,  572,  574,  575,  576,  577,  579,
         580,  583,  586,  596,  597,  599,  601,  603,  605,  606,  613,
         615,  617,  618,  620,  622,  623,  626,  628,  629,  632,  634,
         637,  638,  639,  640,  642,  643,  644,  648,  650,  654,  655,
         657,  659,  665,  668,  670,  671,  672,  673,  678,  679,  680,
         684,  691,  694,  697,  698,  703,  712,  713,  715,  717,  719,
         722,  724,  726,  727,  734,  735,  737,  739,  742,  747,  754,
         759,  763,  765,  767,  769,  772,  781,  782,  786,  790,  793,
         794,  795,  802,  803,  811,  815,  816,  818,  819,  820,  822,
         823,  824,  826,  827,  830,  832,  833,  835,  839,  840,  845,
         848,  850,  854,  856,  859,  862,  863,  865,  866,  868,  876,
         877,  878,  884,  885,  893,  895,  901,  902,  903,  906,  909,
         917,  921,  924,  925,  929,  931,  933,  934,  938,  957,  961,
         965,  970,  972,  973,  982,  984,  986,  988,  989,  990,  991,
         993,  996,  998, 1000, 1001, 1002, 1003, 1004, 1011, 1013, 1015,
        1017, 1019, 1020, 1022, 1025, 1026, 1027, 1028, 1035, 1037, 1038,
        1041, 1042, 1045, 1046, 1047, 1049, 1051, 1052, 1059, 1061, 1065,
        1066, 1067, 1069, 1070, 1072, 1074, 1079, 1082, 1088, 1091, 1097,
        1098, 1101, 1104, 1105, 1107, 1108, 1109, 1113, 1115, 1117, 1120,
        1121, 1123, 1125, 1127, 1128, 1132, 1133, 1134, 1137, 1139, 1141,
        1144, 1145, 1152, 1158, 1161, 1168, 1169, 1170, 1172, 1173, 1178,
        1180, 1182, 1189, 1191, 1192, 1197, 1199, 1200, 1202, 1207, 1211,
        1215, 1217, 1219, 1224, 1226, 1227, 1228, 1229, 1230, 1233, 1234,
        1238, 1241, 1245, 1247, 1248, 1250, 1252, 1254, 1256, 1258, 1259,
        1261, 1270, 1272, 1276, 1283, 1285, 1286, 1287, 1292, 1293, 1294,
        1299, 1302, 1306, 1309, 1311, 1317, 1319, 1321, 1323, 1324, 1325,
        1329, 1335, 1337, 1342, 1343, 1345, 1349, 1352, 1355, 1358, 1359,
        1360, 1362, 1366, 1367, 1371, 1376, 1378, 1380, 1381, 1382, 1387,
        1388, 1391, 1397, 1400, 1402, 1406, 1407, 1408, 1411, 1413, 1415,
        1417, 1419, 1422, 1423, 1426, 1427, 1428, 1435, 1436, 1440, 1448,
        1449, 1453, 1454, 1455, 1456, 1457, 1458, 1459, 1460, 1464, 1465,
        1469]),)
In [79]:
cluster_0 = final_log.iloc[index_0]
cluster_0
Out[79]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
3 0 1 1 0 0 0 0 0 0 0 ... -0.961486 0.246200 -0.835167 0.0 -1.018674 -0.374906 0.338096 0.555462 0.882230 -1.191138
4 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -0.660853 -0.660400 0.0 0.510149 -0.691946 0.338096 -0.987512 -0.618722 -0.596855
10 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -0.660853 -0.986172 0.0 0.510149 -0.691946 0.338096 -0.216025 -0.018341 -0.299713
12 0 0 1 0 0 0 0 0 0 0 ... -0.961486 0.246200 -0.834541 0.0 0.510149 -0.850466 -1.077862 -0.216025 -0.618722 -0.299713
13 0 0 1 0 0 0 0 1 0 0 ... -0.961486 1.153254 -0.912702 0.0 0.510149 -1.167505 0.338096 -0.987512 -0.618722 -0.596855
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1459 0 0 1 0 0 0 0 1 0 0 ... -0.057788 -0.660853 -0.486259 0.0 0.510149 -0.057867 0.338096 -0.473188 -0.318532 -0.299713
1460 0 0 1 0 0 0 0 0 0 0 ... -0.961486 -1.567907 -0.561293 0.0 -1.018674 -0.850466 -2.493820 -0.216025 -0.018341 -0.002572
1464 0 0 0 1 0 0 0 0 0 0 ... -0.961486 0.246200 -0.817346 0.0 -1.018674 -0.850466 0.338096 -0.473188 -0.618722 -1.191138
1465 0 1 1 0 0 0 0 1 0 0 ... -0.057788 1.153254 -0.940839 0.0 0.510149 1.051772 0.338096 -0.216025 -0.618722 -0.299713
1469 0 0 1 0 0 0 0 1 0 0 ... -0.057788 0.246200 -0.367768 0.0 -1.018674 -0.691946 1.754054 -0.473188 -0.318532 -0.596855

529 rows × 29 columns

In [ ]:
# cluster=1
In [80]:
index_1 = np.where(C_var==1)
index_1
Out[80]:
(array([  18,   25,   28,   29,   45,   62,   63,   65,   67,   70,   77,
          78,   82,   85,   90,   93,   95,   98,  106,  112,  123,  126,
         131,  133,  136,  147,  163,  165,  186,  187,  189,  190,  209,
         213,  215,  218,  219,  231,  233,  235,  237,  244,  245,  257,
         263,  268,  270,  271,  275,  276,  279,  280,  290,  295,  300,
         307,  308,  313,  314,  316,  326,  329,  332,  348,  360,  367,
         376,  379,  390,  392,  400,  401,  406,  408,  411,  417,  420,
         424,  425,  427,  429,  433,  435,  448,  455,  458,  464,  465,
         466,  473,  477,  489,  492,  497,  510,  534,  535,  536,  538,
         541,  544,  552,  561,  568,  584,  588,  592,  595,  616,  624,
         625,  646,  649,  653,  674,  677,  695,  699,  701,  706,  716,
         721,  723,  728,  736,  738,  741,  743,  750,  753,  755,  758,
         766,  770,  771,  774,  775,  787,  789,  799,  806,  810,  812,
         813,  814,  821,  838,  851,  858,  869,  887,  890,  894,  897,
         898,  899,  904,  905,  907,  913,  914,  916,  919,  922,  926,
         937,  945,  954,  955,  956,  962,  966,  971,  975,  976,  987,
         994,  999, 1008, 1010, 1014, 1024, 1031, 1034, 1043, 1044, 1054,
        1062, 1076, 1078, 1080, 1086, 1093, 1096, 1111, 1126, 1135, 1138,
        1140, 1154, 1164, 1176, 1177, 1194, 1195, 1196, 1203, 1223, 1225,
        1235, 1242, 1264, 1268, 1275, 1277, 1301, 1303, 1305, 1310, 1327,
        1331, 1348, 1351, 1370, 1374, 1377, 1396, 1401, 1403, 1437, 1443,
        1461, 1462]),)
In [81]:
cluster_1 = final_log.iloc[index_1]
cluster_1
Out[81]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
18 0 0 0 1 0 0 0 0 1 0 ... 1.749610 1.153254 3.078475 0.0 -1.018674 0.144974 0.338096 0.300399 1.182421 0.888852
25 0 0 1 0 0 0 0 0 1 0 ... 2.653309 0.246200 0.288444 0.0 0.510149 2.478450 -1.077862 2.098437 2.683373 1.185994
28 0 0 1 0 0 0 1 0 0 0 ... 0.845911 1.153254 1.459306 0.0 0.510149 2.161410 0.338096 0.300399 0.582040 0.034015
29 0 0 0 1 1 0 0 0 1 0 ... 2.653309 -1.567907 0.288444 0.0 -1.018674 1.844371 -1.077862 -0.987512 -0.618722 -0.893996
45 0 0 1 0 0 1 0 0 0 0 ... 2.653309 0.246200 0.288444 0.0 -1.018674 2.002890 0.338096 0.300399 0.050478 1.185994
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1403 0 0 0 1 1 0 0 0 0 0 ... 1.749610 -1.567907 2.426305 0.0 -1.018674 1.685851 0.338096 0.300399 1.182421 1.780277
1437 1 0 1 0 0 0 0 0 1 0 ... 2.653309 1.153254 0.288444 0.0 -1.018674 1.685851 -1.077862 0.041137 -1.219103 -0.299713
1443 0 0 1 0 0 0 0 0 1 0 ... 2.653309 0.246200 0.288444 0.0 -1.018674 2.161410 -1.077862 0.300399 0.582040 2.968843
1461 0 0 0 1 1 0 0 0 0 0 ... 0.845911 -1.567907 1.648767 0.0 0.510149 1.527331 0.338096 -0.730350 -0.618722 -1.191138
1462 0 0 0 1 1 0 0 0 0 0 ... 1.749610 1.153254 2.016746 0.0 0.510149 1.685851 -1.077862 0.300399 1.482611 0.591711

233 rows × 29 columns

In [ ]:
# cluster=2
In [82]:
index_2 = np.where(C_var==2)
index_2
Out[82]:
(array([   5,    9,   11,   15,   22,   27,   43,   55,   59,   61,   64,
          73,   76,   80,   83,   88,   89,   92,   94,  103,  116,  117,
         119,  121,  124,  129,  137,  139,  144,  150,  151,  153,  154,
         155,  158,  166,  167,  172,  173,  185,  188,  195,  199,  201,
         210,  212,  220,  222,  223,  226,  227,  228,  243,  247,  256,
         261,  269,  282,  283,  285,  291,  293,  297,  303,  304,  305,
         306,  311,  315,  317,  319,  321,  322,  324,  334,  338,  339,
         341,  342,  343,  344,  353,  355,  359,  361,  366,  384,  386,
         394,  398,  403,  412,  423,  426,  432,  442,  444,  446,  447,
         452,  459,  462,  467,  468,  472,  491,  503,  506,  508,  514,
         519,  523,  524,  526,  527,  529,  530,  531,  532,  533,  551,
         554,  558,  562,  564,  569,  578,  590,  593,  600,  604,  607,
         608,  610,  621,  635,  636,  641,  647,  652,  658,  664,  675,
         681,  685,  686,  690,  692,  693,  696,  702,  704,  705,  707,
         708,  710,  718,  729,  730,  733,  745,  749,  751,  752,  756,
         757,  760,  768,  773,  779,  780,  783,  784,  788,  796,  805,
         807,  809,  825,  836,  837,  844,  846,  852,  855,  870,  872,
         873,  874,  879,  881,  883,  888,  889,  891,  896,  900,  908,
         920,  923,  927,  928,  930,  932,  935,  941,  942,  943,  944,
         947,  949,  950,  951,  958,  959,  960,  963,  964,  968,  969,
         979,  983,  985,  995,  997, 1005, 1007, 1018, 1029, 1030, 1033,
        1040, 1048, 1050, 1055, 1063, 1073, 1081, 1084, 1085, 1089, 1090,
        1094, 1095, 1099, 1103, 1106, 1114, 1119, 1122, 1124, 1130, 1131,
        1142, 1143, 1146, 1147, 1148, 1149, 1150, 1155, 1156, 1157, 1159,
        1160, 1162, 1163, 1165, 1174, 1179, 1181, 1187, 1188, 1190, 1204,
        1208, 1210, 1212, 1214, 1216, 1218, 1220, 1231, 1232, 1240, 1244,
        1251, 1253, 1260, 1265, 1267, 1269, 1274, 1278, 1280, 1281, 1282,
        1288, 1289, 1295, 1296, 1304, 1318, 1322, 1330, 1334, 1340, 1341,
        1346, 1350, 1357, 1361, 1363, 1364, 1368, 1373, 1385, 1386, 1392,
        1393, 1395, 1399, 1404, 1409, 1410, 1412, 1416, 1418, 1421, 1424,
        1425, 1429, 1430, 1431, 1434, 1439, 1444, 1450, 1451, 1463, 1466,
        1468]),)
In [83]:
cluster_2 = final_log.iloc[index_2]
cluster_2
Out[83]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
5 0 1 1 0 0 0 0 1 0 0 ... -0.961486 1.153254 -0.785457 0.0 -1.018674 -0.374906 -1.077862 0.298300 0.882230 0.591711
9 0 0 1 0 0 0 1 0 0 0 ... -0.057788 0.246200 -0.107338 0.0 2.038972 1.051772 -1.077862 0.298300 0.882230 0.888852
11 0 0 1 0 0 0 0 1 0 0 ... -0.057788 0.246200 -0.433736 0.0 -1.018674 -0.057867 0.338096 0.812625 0.281849 1.185994
15 0 0 1 0 0 0 0 0 0 1 ... 0.845911 -1.567907 1.375519 0.0 0.510149 -0.057867 0.338096 1.069787 1.482611 1.185994
22 0 0 1 0 0 0 0 0 0 0 ... 0.845911 -0.660853 2.005178 0.0 -1.018674 0.417693 0.338096 1.584112 0.582040 2.077418
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1450 0 0 0 0 0 0 0 0 0 0 ... 0.845911 1.153254 1.018170 0.0 -1.018674 -0.216386 0.338096 0.812625 -1.219103 0.888852
1451 0 0 0 1 0 0 0 0 0 0 ... -0.057788 1.153254 -0.074198 0.0 0.510149 -0.057867 0.338096 1.069787 0.882230 1.483135
1463 1 0 1 0 0 0 0 0 0 1 ... -0.057788 -1.567907 1.361762 0.0 -1.018674 -0.057867 0.338096 0.812625 -0.018341 0.888852
1466 0 0 1 0 0 0 1 0 0 0 ... 0.845911 -1.567907 1.378958 0.0 0.510149 -0.216386 0.338096 0.298300 0.882230 0.888852
1468 0 1 0 1 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.059504 0.0 -1.018674 1.051772 -1.077862 0.812625 0.582040 1.185994

342 rows × 29 columns

In [ ]:
# Clusters = 3
In [84]:
index_3 = np.where(C_var==3)
index_3
Out[84]:
(array([   0,    2,   14,   24,   33,   34,   36,   42,   50,   51,   69,
         102,  107,  122,  127,  132,  171,  177,  192,  204,  214,  216,
         217,  229,  234,  239,  250,  259,  264,  288,  296,  323,  327,
         336,  357,  363,  368,  370,  378,  385,  405,  414,  415,  421,
         422,  436,  439,  440,  443,  453,  457,  463,  469,  479,  480,
         482,  495,  504,  513,  525,  528,  540,  547,  566,  573,  585,
         589,  591,  598,  614,  645,  656,  660,  662,  663,  666,  667,
         683,  688,  689,  709,  711,  720,  731,  732,  744,  748,  761,
         762,  776,  777,  797,  798,  800,  801,  828,  829,  831,  842,
         849,  857,  860,  864,  871,  892,  911,  915,  939,  940,  946,
         952,  953,  980,  981, 1012, 1016, 1021, 1032, 1036, 1039, 1056,
        1057, 1060, 1068, 1077, 1083, 1110, 1112, 1136, 1153, 1167, 1171,
        1201, 1205, 1213, 1222, 1236, 1237, 1246, 1249, 1255, 1257, 1262,
        1271, 1273, 1279, 1290, 1291, 1297, 1312, 1313, 1326, 1332, 1338,
        1339, 1354, 1365, 1369, 1375, 1379, 1390, 1438, 1442, 1452]),)
In [ ]:
# cluster=4
In [85]:
index_4 =np.where(C_var==4)
index_4
Out[85]:
(array([   1,    6,    7,    8,   21,   26,   44,   46,   47,   48,   52,
          56,   58,   60,   71,   75,   79,   87,   91,   96,  100,  105,
         110,  111,  118,  120,  134,  135,  140,  142,  148,  152,  157,
         160,  168,  169,  178,  180,  182,  194,  205,  211,  221,  236,
         251,  260,  266,  272,  277,  278,  281,  286,  289,  325,  335,
         352,  356,  362,  364,  375,  382,  391,  395,  407,  410,  416,
         418,  434,  445,  451,  456,  475,  476,  484,  498,  501,  502,
         509,  517,  518,  521,  537,  545,  553,  563,  567,  581,  582,
         587,  594,  602,  609,  611,  612,  619,  627,  630,  631,  633,
         651,  661,  669,  676,  682,  687,  700,  714,  725,  740,  746,
         764,  778,  785,  791,  792,  804,  808,  817,  834,  841,  843,
         847,  853,  861,  867,  875,  880,  882,  886,  910,  912,  918,
         936,  948,  967,  974,  977,  978,  992, 1006, 1009, 1023, 1053,
        1058, 1064, 1071, 1075, 1087, 1092, 1100, 1102, 1116, 1118, 1129,
        1151, 1166, 1175, 1183, 1184, 1185, 1186, 1193, 1198, 1206, 1209,
        1221, 1239, 1243, 1263, 1266, 1284, 1298, 1300, 1307, 1308, 1314,
        1315, 1316, 1320, 1328, 1333, 1336, 1344, 1347, 1353, 1356, 1372,
        1383, 1384, 1389, 1394, 1398, 1405, 1414, 1420, 1432, 1433, 1441,
        1445, 1446, 1447, 1467]),)
In [86]:
cluster_4 = final_log.iloc[index_4]
cluster_4
Out[86]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
1 0 1 1 0 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.140791 0.0 0.510149 -0.057867 0.338096 1.069787 0.882230 0.888852
6 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -1.567907 -0.909888 0.0 0.195024 0.259173 -1.077862 -1.244675 -1.219103 -1.191138
7 0 0 1 0 0 0 0 1 0 0 ... -0.961486 0.246200 -0.902697 0.0 0.510149 -1.484545 0.338096 -1.244675 -1.219103 -1.191138
8 0 1 1 0 0 0 0 0 0 1 ... 0.845911 0.246200 1.233580 0.0 -1.018674 -0.057867 0.338096 0.812625 0.882230 1.185994
21 0 0 0 1 0 0 0 0 0 0 ... -0.961486 -1.567907 -0.679471 0.0 -1.018674 -0.057867 0.338096 -0.216025 -0.318532 -0.299713
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1441 1 0 1 0 0 0 1 0 0 0 ... -0.057788 0.246200 0.226875 0.0 0.510149 0.417693 -1.077862 1.841275 2.383182 1.483135
1445 0 0 1 0 0 0 0 0 0 1 ... 1.749610 -0.660853 2.497900 0.0 0.510149 1.685851 0.338096 0.300399 0.882230 1.780277
1446 0 0 0 1 1 0 0 0 0 0 ... -0.057788 0.246200 0.353807 0.0 2.038972 -0.374906 0.338096 0.555462 0.882230 0.888852
1447 1 0 0 1 1 0 0 0 0 0 ... -0.057788 1.153254 -0.054502 0.0 0.510149 0.734732 -1.077862 2.355600 2.383182 2.077418
1467 0 0 1 0 0 0 0 0 0 1 ... -0.057788 -0.660853 0.175602 0.0 0.510149 -0.691946 0.338096 0.041137 -0.618722 -0.299713

202 rows × 29 columns

In [87]:
WCSS_list1 = []
for k in range(1,10):
    kmeans_model = KMeans(n_clusters=k)  
    kmeans_model.fit(x_log)
    WCSS = kmeans_model.inertia_
    WCSS_list1.append(WCSS)
    print('K >>',k,'WCSS>>',WCSS)
K >> 1 WCSS>> 36749.99999999997
K >> 2 WCSS>> 32432.28022057883
K >> 3 WCSS>> 30728.82109411982
K >> 4 WCSS>> 29482.758915733095
K >> 5 WCSS>> 28490.18203277078
K >> 6 WCSS>> 27776.60942323769
K >> 7 WCSS>> 27144.416717879314
K >> 8 WCSS>> 26649.96132712636
K >> 9 WCSS>> 26359.154762882907
In [88]:
WCSS
Out[88]:
26359.154762882907
In [ ]:
# Elbow_method
In [89]:
k = range(1,10)
plt.plot(k,WCSS_list1)
plt.xlabel('Number of clusters(K)')
plt.ylabel('WCSS')
plt.title('Elbow method graph')
Out[89]:
Text(0.5, 1.0, 'Elbow method graph')
In [ ]:
# Predicted = 2
In [90]:
kmeans_model = KMeans(n_clusters=2)
kmeans_model.fit(x_log)
Out[90]:
KMeans(n_clusters=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=2)
In [91]:
kmeans_model = KMeans(n_clusters=2)  # n_clusters=8  >> default
pred_k=kmeans_model.fit_predict(x_log)
In [92]:
pred_k
Out[92]:
array([1, 0, 1, ..., 1, 0, 1], dtype=int32)
In [93]:
final_log
Out[93]:
BusinessTravel_Non-Travel BusinessTravel_Travel_Frequently Department_Research & Development Department_Sales EducationField_Marketing EducationField_Technical Degree JobRole_Healthcare Representative JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director ... JobLevel JobSatisfaction MonthlyIncome StandardHours StockOptionLevel TotalWorkingYears WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsWithCurrManager
0 0 0 0 1 0 0 0 0 0 0 ... -0.057788 1.153254 0.129018 0.0 -1.018674 -0.374906 -2.493820 0.041137 -0.018341 0.294570
1 0 1 1 0 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.140791 0.0 0.510149 -0.057867 0.338096 1.069787 0.882230 0.888852
2 0 0 1 0 0 0 0 1 0 0 ... -0.961486 0.246200 -1.091220 0.0 -1.018674 -0.533426 0.338096 -1.501837 -1.219103 -1.191138
3 0 1 1 0 0 0 0 0 0 0 ... -0.961486 0.246200 -0.835167 0.0 -1.018674 -0.374906 0.338096 0.555462 0.882230 -1.191138
4 0 0 1 0 0 0 0 1 0 0 ... -0.961486 -0.660853 -0.660400 0.0 0.510149 -0.691946 0.338096 -0.987512 -0.618722 -0.596855
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 0 1 1 0 0 0 0 1 0 0 ... -0.057788 1.153254 -0.940839 0.0 0.510149 1.051772 0.338096 -0.216025 -0.618722 -0.299713
1466 0 0 1 0 0 0 1 0 0 0 ... 0.845911 -1.567907 1.378958 0.0 0.510149 -0.216386 0.338096 0.298300 0.882230 0.888852
1467 0 0 1 0 0 0 0 0 0 1 ... -0.057788 -0.660853 0.175602 0.0 0.510149 -0.691946 0.338096 0.041137 -0.618722 -0.299713
1468 0 1 0 1 0 0 0 0 0 0 ... -0.057788 -0.660853 -0.059504 0.0 -1.018674 1.051772 -1.077862 0.812625 0.582040 1.185994
1469 0 0 1 0 0 0 0 1 0 0 ... -0.057788 0.246200 -0.367768 0.0 -1.018674 -0.691946 1.754054 -0.473188 -0.318532 -0.596855

1470 rows × 29 columns

Principle Component analysis¶

In [94]:
from sklearn.decomposition import PCA
In [95]:
pca = PCA(n_components=2)
In [96]:
pca.fit(x_log)
Out[96]:
PCA(n_components=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA(n_components=2)
In [97]:
x_pca = pca.transform(x_log)
In [98]:
x_pca
Out[98]:
array([[-0.49360165,  1.41619175],
       [ 1.15378029, -1.55229841],
       [-3.03698632,  1.36126884],
       ...,
       [-0.98841934, -1.0507973 ],
       [ 1.47493103,  0.30399305],
       [-1.06652057,  0.13379506]])
In [ ]:
# normalization of pca due to noise in clustering
In [99]:
x_pca_scaler = StandardScaler().fit(x_pca)
In [100]:
x_pca1 = x_pca_scaler.transform(x_pca)
In [101]:
x_pca1
Out[101]:
array([[-0.24458175,  1.0159896 ],
       [ 0.57170313, -1.11363383],
       [-1.50483988,  0.97658738],
       ...,
       [-0.48976606, -0.7538521 ],
       [ 0.73083471,  0.21808754],
       [-0.52846556,  0.09598586]])
In [102]:
H = pd.DataFrame(x_pca1)
H
Out[102]:
0 1
0 -0.244582 1.015990
1 0.571703 -1.113634
2 -1.504840 0.976587
3 -0.257197 -0.558017
4 -0.981771 -0.244826
... ... ...
1465 -0.239128 0.387034
1466 0.702694 0.004628
1467 -0.489766 -0.753852
1468 0.730835 0.218088
1469 -0.528466 0.095986

1470 rows × 2 columns

In [103]:
L = H.rename({0:'X1',1:'Y1'},axis=1)
In [104]:
L
Out[104]:
X1 Y1
0 -0.244582 1.015990
1 0.571703 -1.113634
2 -1.504840 0.976587
3 -0.257197 -0.558017
4 -0.981771 -0.244826
... ... ...
1465 -0.239128 0.387034
1466 0.702694 0.004628
1467 -0.489766 -0.753852
1468 0.730835 0.218088
1469 -0.528466 0.095986

1470 rows × 2 columns

In [ ]:
# Visualaisation of Clustering
In [105]:
plt.scatter(L['X1'],L['Y1'],c=pred_k,s=30)
plt.title("visualisation of K-means")
plt.xlabel("X-AXIS")
plt.ylabel("Y-AXIS")
plt.show()
In [106]:
from sklearn.metrics import silhouette_score
In [107]:
silhouette_score(L,pred_k)
Out[107]:
0.3733471110993441

2. Heirarchical Clustering¶

In [ ]:
# Dendrogram
In [108]:
import scipy.cluster.hierarchy as sch
plt.figure(figsize=(10,9))

dendrogram = sch.dendrogram(sch.linkage(L,method='ward'))
plt.title('dendrogram')
plt.xlabel('X-AXIS')
plt.ylabel('Euclidean distances')
plt.show()
In [109]:
from sklearn.cluster import AgglomerativeClustering
cluster = AgglomerativeClustering(n_clusters=2,affinity = 'euclidean',linkage='ward')
In [110]:
cluster.fit_predict(L)
Out[110]:
array([0, 1, 0, ..., 0, 1, 0])
In [111]:
XC = cluster.fit_predict(L)
In [112]:
from sklearn.metrics import silhouette_score
In [113]:
silhouette_score(L,XC)
Out[113]:
0.38182922187276147
In [ ]:
# Visualisation of clustering 
In [114]:
plt.scatter(L['X1'],L['Y1'],c=XC,s=30)
plt.title("Visualisation of Heirarchical clustering")
plt.xlabel("X-AXIS")
plt.ylabel("Y-AXIS")
plt.show()

3] DBSCAN Clustering¶

In [115]:
from sklearn.cluster import DBSCAN
In [116]:
A = DBSCAN (eps=3.6,min_samples=8)
In [117]:
labels = A.fit_predict(x_log)
np.unique(labels)
Out[117]:
array([-1,  0,  1,  2])
In [118]:
labels = A.fit_predict(x_log)
In [119]:
np.unique(labels)
Out[119]:
array([-1,  0,  1,  2])
In [ ]:
# VISUALISATION OF CLUSTER 
In [120]:
plt.scatter(L['X1'],L['Y1'],c=labels,s=30)
plt.xlabel("X-AXIS")
plt.ylabel("Y-AXIS")
plt.title("Visualisation of DBSCAN ")
plt.show()
In [121]:
silhouette_score(L,labels)
Out[121]:
-0.12086921127977733
In [ ]:
 
In [ ]: